1 files changed, 143 insertions, 0 deletions
diff --git a/packages/astro-parser/src/parse/utils/html.ts b/packages/astro-parser/src/parse/utils/html.ts
new file mode 100644
index 000000000..3b406c9cc
--- /dev/null
+++ b/packages/astro-parser/src/parse/utils/html.ts
@@ -0,0 +1,143 @@
+// @ts-nocheck
+
+import entities from './entities.js';
+
+const windows_1252 = [ // replacement code points for 128-159; entry i stands in for code point 128 + i
+  8364, // 128 -> U+20AC €
+  129, // 129 -> unchanged
+  8218, // 130 -> U+201A ‚
+  402, // 131 -> U+0192 ƒ
+  8222, // 132 -> U+201E „
+  8230, // 133 -> U+2026 …
+  8224, // 134 -> U+2020 †
+  8225, // 135 -> U+2021 ‡
+  710, // 136 -> U+02C6 ˆ
+  8240, // 137 -> U+2030 ‰
+  352, // 138 -> U+0160 Š
+  8249, // 139 -> U+2039 ‹
+  338, // 140 -> U+0152 Œ
+  141, // 141 -> unchanged
+  381, // 142 -> U+017D Ž
+  143, // 143 -> unchanged
+  144, // 144 -> unchanged
+  8216, // 145 -> U+2018 ‘
+  8217, // 146 -> U+2019 ’
+  8220, // 147 -> U+201C “
+  8221, // 148 -> U+201D ”
+  8226, // 149 -> U+2022 •
+  8211, // 150 -> U+2013 –
+  8212, // 151 -> U+2014 —
+  732, // 152 -> U+02DC ˜
+  8482, // 153 -> U+2122 ™
+  353, // 154 -> U+0161 š
+  8250, // 155 -> U+203A ›
+  339, // 156 -> U+0153 œ
+  157, // 157 -> unchanged
+  382, // 158 -> U+017E ž
+  376, // 159 -> U+0178 Ÿ
+];
+
+// Matches `&` + optional `#` + (`x…` | digits | known entity name), ended by `;` or a word boundary; group 1 is everything after `&`, terminator excluded.
+const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g');
+
+/**
+ * Replaces every HTML character reference in `html` with the character it denotes.
+ * References that resolve to no usable code point are left in the text unchanged.
+ */
+export function decode_character_references(html: string) {
+  const to_code_point = (reference: string) => {
+    // no leading `#`: look the name up in the entity table
+    if (reference[0] !== '#') return entities[reference];
+    // `#x…` is hexadecimal, any other `#…` is decimal
+    const is_hex = reference[1] === 'x';
+    return parseInt(reference.substring(is_hex ? 2 : 1), is_hex ? 16 : 10);
+  };
+
+  return html.replace(entity_pattern, (whole, reference) => {
+    const code_point = to_code_point(reference);
+    // undefined, NaN and 0 all mean "nothing to decode"
+    return code_point ? String.fromCodePoint(validate_code(code_point)) : whole;
+  });
+}
+
+// Code point substituted for references that must not be decoded as written (U+0000).
+const NUL = 0;
+
+/**
+ * Sanitises the code point produced by a character reference before it is turned into text.
+ *
+ * Some code points are verboten. If we were inserting HTML, the browser would replace the
+ * illegal code points with alternatives in some cases - since we're bypassing that mechanism,
+ * we need to replace them ourselves.
+ *
+ * Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
+ *
+ * @param code - code point parsed from the reference
+ * @returns the code point to emit:
+ *   - 32 (space) for a line feed, the `windows_1252` entry for 128-159,
+ *   - `NUL` for surrogate halves and for values above 0x10ffff (or NaN),
+ *   - `code` unchanged otherwise
+ */
+function validate_code(code: number) {
+  // line feed becomes generic whitespace
+  if (code === 10) {
+    return 32;
+  }
+
+  // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
+  if (code < 128) {
+    return code;
+  }
+
+  // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
+  // to correct the mistake or we'll end up with missing € signs and so on
+  if (code <= 159) {
+    return windows_1252[code - 128];
+  }
+
+  // UTF-16 surrogate halves (0xd800 - 0xdfff) cannot stand alone as characters
+  if (code >= 0xd800 && code <= 0xdfff) {
+    return NUL;
+  }
+
+  // Every other code point up to the end of the Unicode range is legal. Planes 3-16 used to
+  // be rejected here, which turned valid characters (plane 3 CJK ideographs, the plane 14 tag
+  // characters used in flag emoji sequences, private-use planes 15-16) into NUL.
+  if (code <= 0x10ffff) {
+    return code;
+  }
+
+  // Beyond the last Unicode code point: String.fromCodePoint would throw a RangeError, so
+  // these keep mapping to NUL.
+  return NUL;
+}
+
+// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission - each key is an element whose end tag may be omitted, each set lists the following tags that implicitly close it
+const disallowed_contents = new Map([
+  ['li', new Set(['li'])],
+  ['dt', new Set(['dt', 'dd'])],
+  ['dd', new Set(['dt', 'dd'])],
+  ['p', new Set('address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split(' '))],
+  ['rt', new Set(['rt', 'rp'])],
+  ['rp', new Set(['rt', 'rp'])],
+  ['optgroup', new Set(['optgroup'])],
+  ['option', new Set(['option', 'optgroup'])],
+  ['thead', new Set(['tbody', 'tfoot'])],
+  ['tbody', new Set(['tbody', 'tfoot'])],
+  ['tfoot', new Set(['tbody'])],
+  ['tr', new Set(['tr', 'tbody'])],
+  ['td', new Set(['td', 'th', 'tr'])],
+  ['th', new Set(['td', 'th', 'tr'])],
+]); // NOTE(review): the current WHATWG list for `p` also names details, figcaption and figure - confirm the omission is intentional
+
+/**
+ * Reports whether `current` may be left unclosed at this point: it must have an entry in
+ * `disallowed_contents`, and either no `next` tag is given or `next` is in that entry,
+ * like the second `<li>` in `<li>one<li>two`.
+ */
+export function closing_tag_omitted(current: string, next?: string) {
+  const closers = disallowed_contents.get(current);
+  if (closers === undefined) return false;
+
+  return !next || closers.has(next);
+}