summary | refs | log | tree | commit | diff
path: root/packages/astro-parser/src
diff options
context:
space:
mode:
author: Fred K. Schott <fkschott@gmail.com> 2021-07-20 13:37:18 -0700
committer: Fred K. Schott <fkschott@gmail.com> 2021-07-20 13:37:18 -0700
commit: fcfc62533116ea51e9138d7a0ef6d6f3ab285d21 (patch)
tree: 8dec41ad5d5a65e1232f993d74df5841d07e4bea /packages/astro-parser/src
parent: b9c5b7e9807c4ecb6c3ed0596ce1605db103c644 (diff)
download: astro-fcfc62533116ea51e9138d7a0ef6d6f3ab285d21.tar.gz
astro-fcfc62533116ea51e9138d7a0ef6d6f3ab285d21.tar.zst
astro-fcfc62533116ea51e9138d7a0ef6d6f3ab285d21.zip
Revert "fix(parser): html entities evaluated (#738)"
This reverts commit 268186c27d436dd4fe6a330af8790ceeaeb6492c.
Diffstat (limited to 'packages/astro-parser/src')
-rw-r--r-- packages/astro-parser/src/parse/state/tag.ts | 4
-rw-r--r-- packages/astro-parser/src/parse/state/text.ts | 3
-rw-r--r-- packages/astro-parser/src/parse/utils/html.ts | 85
3 files changed, 88 insertions, 4 deletions
diff --git a/packages/astro-parser/src/parse/state/tag.ts b/packages/astro-parser/src/parse/state/tag.ts
index f3d30b06d..70fa9e361 100644
--- a/packages/astro-parser/src/parse/state/tag.ts
+++ b/packages/astro-parser/src/parse/state/tag.ts
@@ -2,7 +2,7 @@
import read_expression from '../read/expression.js';
import read_style from '../read/style.js';
-import { closing_tag_omitted } from '../utils/html.js';
+import { decode_character_references, closing_tag_omitted } from '../utils/html.js';
import { is_void } from '../../utils/names.js';
import { Parser } from '../index.js';
import { Directive, DirectiveType, TemplateNode, Text } from '../../interfaces.js';
@@ -533,7 +533,7 @@ export function read_sequence(parser: Parser, done: () => boolean): TemplateNode
function flush() {
if (current_chunk.raw) {
- current_chunk.data = current_chunk.raw;
+ current_chunk.data = decode_character_references(current_chunk.raw);
current_chunk.end = parser.index;
chunks.push(current_chunk);
}
diff --git a/packages/astro-parser/src/parse/state/text.ts b/packages/astro-parser/src/parse/state/text.ts
index dec284ae4..020d066fd 100644
--- a/packages/astro-parser/src/parse/state/text.ts
+++ b/packages/astro-parser/src/parse/state/text.ts
@@ -1,5 +1,6 @@
// @ts-nocheck
+import { decode_character_references } from '../utils/html.js';
import { Parser } from '../index.js';
export default function text(parser: Parser) {
@@ -24,7 +25,7 @@ export default function text(parser: Parser) {
end: parser.index,
type: 'Text',
raw: data,
- data,
+ data: decode_character_references(data),
};
parser.current().children.push(node);
diff --git a/packages/astro-parser/src/parse/utils/html.ts b/packages/astro-parser/src/parse/utils/html.ts
index 9988174f3..e4669a2db 100644
--- a/packages/astro-parser/src/parse/utils/html.ts
+++ b/packages/astro-parser/src/parse/utils/html.ts
@@ -1,3 +1,86 @@
+// @ts-nocheck
+
+import entities from './entities.js';
+
+const windows_1252 = [ // substitute code points for the values 128-159; validate_code indexes this table with (code - 128)
+ 8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376,
+];
+// Matches "&", an optional "#", then "x" + word characters, a run of digits, or a key of `entities`, ended by ";" or a word boundary. Group 1 is the text after "&" (without ";"); the "g" flag makes replace() visit every match.
+const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g');
+/** Returns `html` with every character reference matched by `entity_pattern` (named, decimal "#N", hex "#xN") replaced by the character it resolves to; references that resolve to nothing are left as written. */
+export function decode_character_references(html: string) {
+ return html.replace(entity_pattern, (match, entity) => { // entity: capture group 1, i.e. the reference without "&" and ";"
+ let code;
+
+ // No leading "#": look the name up in the imported `entities` table (presumably name -> numeric code point; the table is not shown here)
+ if (entity[0] !== '#') {
+ code = entities[entity];
+ } else if (entity[1] === 'x') { // "#x…": parse what follows as hexadecimal
+ code = parseInt(entity.substring(2), 16);
+ } else { // "#…": parse what follows as decimal
+ code = parseInt(entity.substring(1), 10);
+ }
+ // A falsy result (undefined lookup, NaN from parseInt, or 0) means there is nothing to substitute: keep the original text
+ if (!code) {
+ return match;
+ }
+ // Run the code point through validate_code first so disallowed values are swapped before the character is built
+ return String.fromCodePoint(validate_code(code));
+ });
+}
+
+const NUL = 0; // code point that validate_code returns for values it rejects
+
+// Maps a decoded code point to the code point that should actually be emitted. Some code points are
+// verboten: if we were inserting HTML, the browser would replace the illegal ones with alternatives
+// in some cases - since we're bypassing that mechanism, we need to replace them ourselves.
+// Returns `code` unchanged when it is accepted, a substitute (32 or a windows_1252 entry), or NUL when rejected.
+// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
+function validate_code(code: number) {
+ // line feed (10) becomes generic whitespace: a plain space (32)
+ if (code === 10) {
+ return 32;
+ }
+
+ // rest of the ASCII range passes through. (Why someone would use HTML entities for ASCII characters I don't know, but...)
+ if (code < 128) {
+ return code;
+ }
+
+ // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
+ // to correct the mistake or we'll end up with missing € signs and so on (table lookup by offset from 128)
+ if (code <= 159) {
+ return windows_1252[code - 128];
+ }
+
+ // basic multilingual plane below the surrogate block (160 - 0xd7ff)
+ if (code < 55296) {
+ return code;
+ }
+
+ // UTF-16 surrogate halves (0xd800 - 0xdfff) are rejected
+ if (code <= 57343) {
+ return NUL;
+ }
+
+ // rest of the basic multilingual plane (0xe000 - 0xffff)
+ if (code <= 65535) {
+ return code;
+ }
+
+ // supplementary multilingual plane 0x10000 - 0x1ffff
+ if (code >= 65536 && code <= 131071) {
+ return code;
+ }
+
+ // supplementary ideographic plane 0x20000 - 0x2ffff
+ if (code >= 131072 && code <= 196607) {
+ return code;
+ }
+ // everything above 0x2ffff is rejected. NOTE(review): Unicode code points run up to 0x10ffff, so planes 3-16 also become NUL here - confirm this is intended
+ return NUL;
+}
+
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
const disallowed_contents = new Map([
['li', new Set(['li'])],
@@ -20,7 +103,7 @@ const disallowed_contents = new Map([
// close it, like `<li>one<li>two`?
export function closing_tag_omitted(current: string, next?: string) {
if (disallowed_contents.has(current)) {
- if (!next || disallowed_contents.get(current)!.has(next)) {
+ if (!next || disallowed_contents.get(current).has(next)) {
return true;
}
}