summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Carter Snook <cartersnook04@gmail.com> 2021-07-20 14:18:42 -0500
committer: GitHub <noreply@github.com> 2021-07-20 12:18:42 -0700
commit: 268186c27d436dd4fe6a330af8790ceeaeb6492c (patch)
tree: 9963a60461f3b892ddf34ecd3bea58bbb3afae70
parent: d6a9afb8e186a0481dc9d8a37e682958471dd8e7 (diff)
download: astro-268186c27d436dd4fe6a330af8790ceeaeb6492c.tar.gz
astro-268186c27d436dd4fe6a330af8790ceeaeb6492c.tar.zst
astro-268186c27d436dd4fe6a330af8790ceeaeb6492c.zip
fix(parser): html entities evaluated (#738)
-rw-r--r-- packages/astro-parser/src/parse/state/tag.ts 4
-rw-r--r-- packages/astro-parser/src/parse/state/text.ts 3
-rw-r--r-- packages/astro-parser/src/parse/utils/html.ts 85
-rw-r--r-- packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json 3
-rw-r--r-- packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro 11
-rw-r--r-- packages/astro/test/html-encoded-characters.test.js 23
6 files changed, 41 insertions, 88 deletions
diff --git a/packages/astro-parser/src/parse/state/tag.ts b/packages/astro-parser/src/parse/state/tag.ts
index 70fa9e361..f3d30b06d 100644
--- a/packages/astro-parser/src/parse/state/tag.ts
+++ b/packages/astro-parser/src/parse/state/tag.ts
@@ -2,7 +2,7 @@
import read_expression from '../read/expression.js';
import read_style from '../read/style.js';
-import { decode_character_references, closing_tag_omitted } from '../utils/html.js';
+import { closing_tag_omitted } from '../utils/html.js';
import { is_void } from '../../utils/names.js';
import { Parser } from '../index.js';
import { Directive, DirectiveType, TemplateNode, Text } from '../../interfaces.js';
@@ -533,7 +533,7 @@ export function read_sequence(parser: Parser, done: () => boolean): TemplateNode
function flush() {
if (current_chunk.raw) {
- current_chunk.data = decode_character_references(current_chunk.raw);
+ current_chunk.data = current_chunk.raw;
current_chunk.end = parser.index;
chunks.push(current_chunk);
}
diff --git a/packages/astro-parser/src/parse/state/text.ts b/packages/astro-parser/src/parse/state/text.ts
index 020d066fd..dec284ae4 100644
--- a/packages/astro-parser/src/parse/state/text.ts
+++ b/packages/astro-parser/src/parse/state/text.ts
@@ -1,6 +1,5 @@
// @ts-nocheck
-import { decode_character_references } from '../utils/html.js';
import { Parser } from '../index.js';
export default function text(parser: Parser) {
@@ -25,7 +24,7 @@ export default function text(parser: Parser) {
end: parser.index,
type: 'Text',
raw: data,
- data: decode_character_references(data),
+ data,
};
parser.current().children.push(node);
diff --git a/packages/astro-parser/src/parse/utils/html.ts b/packages/astro-parser/src/parse/utils/html.ts
index e4669a2db..9988174f3 100644
--- a/packages/astro-parser/src/parse/utils/html.ts
+++ b/packages/astro-parser/src/parse/utils/html.ts
@@ -1,86 +1,3 @@
-// @ts-nocheck
-
-import entities from './entities.js';
-
-const windows_1252 = [
- 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376,
-];
-
-const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g');
-
-export function decode_character_references(html: string) {
- return html.replace(entity_pattern, (match, entity) => {
- let code;
-
- // Handle named entities
- if (entity[0] !== '#') {
- code = entities[entity];
- } else if (entity[1] === 'x') {
- code = parseInt(entity.substring(2), 16);
- } else {
- code = parseInt(entity.substring(1), 10);
- }
-
- if (!code) {
- return match;
- }
-
- return String.fromCodePoint(validate_code(code));
- });
-}
-
-const NUL = 0;
-
-// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
-// code points with alternatives in some cases - since we're bypassing that mechanism, we need
-// to replace them ourselves
-//
-// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
-function validate_code(code: number) {
- // line feed becomes generic whitespace
- if (code === 10) {
- return 32;
- }
-
- // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
- if (code < 128) {
- return code;
- }
-
- // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
- // to correct the mistake or we'll end up with missing € signs and so on
- if (code <= 159) {
- return windows_1252[code - 128];
- }
-
- // basic multilingual plane
- if (code < 55296) {
- return code;
- }
-
- // UTF-16 surrogate halves
- if (code <= 57343) {
- return NUL;
- }
-
- // rest of the basic multilingual plane
- if (code <= 65535) {
- return code;
- }
-
- // supplementary multilingual plane 0x10000 - 0x1ffff
- if (code >= 65536 && code <= 131071) {
- return code;
- }
-
- // supplementary ideographic plane 0x20000 - 0x2ffff
- if (code >= 131072 && code <= 196607) {
- return code;
- }
-
- return NUL;
-}
-
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
const disallowed_contents = new Map([
['li', new Set(['li'])],
@@ -103,7 +20,7 @@ const disallowed_contents = new Map([
// close it, like `<li>one<li>two`?
export function closing_tag_omitted(current: string, next?: string) {
if (disallowed_contents.has(current)) {
- if (!next || disallowed_contents.get(current).has(next)) {
+ if (!next || disallowed_contents.get(current)!.has(next)) {
return true;
}
}
diff --git a/packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json b/packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json
new file mode 100644
index 000000000..8f034781d
--- /dev/null
+++ b/packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json
@@ -0,0 +1,3 @@
+{
+ "workspaceRoot": "../../../../../"
+}
diff --git a/packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro b/packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro
new file mode 100644
index 000000000..a174c3491
--- /dev/null
+++ b/packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro
@@ -0,0 +1,11 @@
+---
+---
+<html>
+<head><title>HTML Encoded Characters</title></head>
+<body>
+ <h1>&nbsp;&nbsp;&nbsp;Hello, world&semi;</h1>
+ <div>
+ <p>Nested elements&quest; No problem&period;&nbsp;</p>
+ </div>
+</body>
+</html>
diff --git a/packages/astro/test/html-encoded-characters.test.js b/packages/astro/test/html-encoded-characters.test.js
new file mode 100644
index 000000000..e12656a3c
--- /dev/null
+++ b/packages/astro/test/html-encoded-characters.test.js
@@ -0,0 +1,23 @@
+import { suite } from 'uvu';
+import * as assert from 'uvu/assert';
+import { doc } from './test-utils.js';
+import { setup } from './helpers.js';
+
+const HtmlEncodedChars = suite('HTML Encoded Characters');
+
+setup(HtmlEncodedChars, './fixtures/html-encoded-characters');
+
+HtmlEncodedChars("doesn't decode html entities", async ({ runtime }) => {
+ const result = await runtime.load('/');
+ if (result.error) throw new Error(result.error);
+
+ const $ = doc(result.contents);
+ // Note: although this may look like it's incorrectly decoding the chars,
+ // Cheerio is showing how the browsers _should_ interpret the HTML. If it
+ // wasn't working correctly, then the spaces would have been trimmed to a
+ // single space.
+ assert.equal($('h1').html(), '&nbsp;&nbsp;&nbsp;Hello, world;');
+ assert.equal($('div p').html(), 'Nested elements? No problem.&nbsp;');
+});
+
+HtmlEncodedChars.run();