diff options
author | 2021-04-30 16:33:35 -0500 | |
---|---|---|
committer | 2021-04-30 16:33:35 -0500 | |
commit | 4df1347156cf2632ea2f3475d3a5f8f08d197cc3 (patch) | |
tree | 9d50de89dfe62827c32a8a4046120af4ab61dc0c /packages/astro-parser/src/parse/utils/html.ts | |
parent | 1d498facc8f78a3ffbfecd05cc6ecd45e8a4a1ae (diff) | |
download | astro-4df1347156cf2632ea2f3475d3a5f8f08d197cc3.tar.gz astro-4df1347156cf2632ea2f3475d3a5f8f08d197cc3.tar.zst astro-4df1347156cf2632ea2f3475d3a5f8f08d197cc3.zip |
Migrate to `yarn` monorepo (#157)
* chore: use monorepo
* chore: scaffold astro-scripts
* chore: move tests inside packages/astro
* chore: refactor tests, add scripts
* chore: move parser to own module
* chore: move runtime to packages/astro
* fix: move parser to own package
* test: fix prettier-plugin-astro tests
* fix: tests
* chore: update package-lock
* chore: add changesets
* fix: cleanup examples
* fix: starter example
* chore: update changeset config
* chore: update changeset config
* chore: setup changeset release workflow
* chore: bump lockfiles
* chore: prism => astro-prism
* fix: tsc --emitDeclarationOnly
* chore: final cleanup, switch to yarn
* chore: add lerna
* chore: update workflows to yarn
* chore: update workflows
* chore: remove lint workflow
* chore: add astro-dev script
* chore: add symlinked README
Diffstat (limited to 'packages/astro-parser/src/parse/utils/html.ts')
-rw-r--r-- | packages/astro-parser/src/parse/utils/html.ts | 143 |
1 files changed, 143 insertions, 0 deletions
// @ts-nocheck
//
// Source file: packages/astro-parser/src/parse/utils/html.ts
// (added as a new file in this commit: blob 3b406c9cc, 143 lines in the original diff)

import entities from './entities.js';

/**
 * Replacement code points for the numeric references 128-159.
 * The entry at index `i` is the code point substituted for code `128 + i`
 * (see `validate_code`). Entries that equal their own code (129, 141, 143,
 * 144, 157) are passed through unchanged.
 */
const windows_1252 = [
  8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143,
  144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376,
];

/**
 * Matches a character reference: `&`, then either a numeric body (`#` plus
 * decimal digits, or `#x` / `#X` plus word characters) or one of the names
 * that are keys of `entities`, terminated by `;` or a word boundary.
 * Capture group 1 is the reference body without the leading `&`.
 */
const entity_pattern = new RegExp(`&(#?(?:[xX][\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g');

/**
 * Replaces the character references found in `html` with the characters
 * they denote.
 *
 * Named references are looked up in `entities`; numeric references are
 * parsed as decimal (`&#65;`) or hexadecimal (`&#x41;` / `&#X41;`). The
 * resulting code is passed through `validate_code` before being turned
 * into a string.
 *
 * A reference whose code cannot be determined, or is 0, is left in the
 * output exactly as written.
 *
 * @param html - text that may contain character references
 * @returns the text with recognised references decoded
 */
export function decode_character_references(html: string) {
  return html.replace(entity_pattern, (match, entity) => {
    let code;

    if (entity[0] !== '#') {
      // Handle named entities
      code = entities[entity];
    } else if (entity[1] === 'x' || entity[1] === 'X') {
      // The HTML syntax allows the hex marker in either case.
      code = parseInt(entity.substring(2), 16);
    } else {
      code = parseInt(entity.substring(1), 10);
    }

    // undefined (unknown name), NaN (unparseable digits) and 0 all land here.
    if (!code) {
      return match;
    }

    return String.fromCodePoint(validate_code(code));
  });
}

const NUL = 0;

// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
// to replace them ourselves
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
/**
 * Maps a decoded code to the code point that should actually be emitted.
 *
 * @param code - the number obtained from a character reference
 * @returns the code point to emit; `NUL` (0) for surrogate halves and for
 *   anything above 0x2ffff
 */
function validate_code(code: number) {
  // line feed becomes generic whitespace
  if (code === 10) {
    return 32;
  }

  // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
  if (code < 128) {
    return code;
  }

  // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
  // to correct the mistake or we'll end up with missing € signs and so on
  if (code <= 159) {
    return windows_1252[code - 128];
  }

  // basic multilingual plane
  if (code < 55296) {
    return code;
  }

  // UTF-16 surrogate halves
  if (code <= 57343) {
    return NUL;
  }

  // rest of the basic multilingual plane
  if (code <= 65535) {
    return code;
  }

  // supplementary multilingual plane 0x10000 - 0x1ffff
  if (code >= 65536 && code <= 131071) {
    return code;
  }

  // supplementary ideographic plane 0x20000 - 0x2ffff
  if (code >= 131072 && code <= 196607) {
    return code;
  }

  return NUL;
}

// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
/**
 * For each element name whose closing tag may be omitted, the set of
 * following element names that implicitly close it.
 */
const disallowed_contents = new Map([
  ['li', new Set(['li'])],
  ['dt', new Set(['dt', 'dd'])],
  ['dd', new Set(['dt', 'dd'])],
  ['p', new Set('address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split(' '))],
  ['rt', new Set(['rt', 'rp'])],
  ['rp', new Set(['rt', 'rp'])],
  ['optgroup', new Set(['optgroup'])],
  ['option', new Set(['option', 'optgroup'])],
  ['thead', new Set(['tbody', 'tfoot'])],
  ['tbody', new Set(['tbody', 'tfoot'])],
  ['tfoot', new Set(['tbody'])],
  ['tr', new Set(['tr', 'tbody'])],
  ['td', new Set(['td', 'th', 'tr'])],
  ['th', new Set(['td', 'th', 'tr'])],
]);

/**
 * Can `next` be a child of the `current` element, or does it implicitly
 * close it, like `<li>one<li>two`?
 *
 * @param current - name of the element that is currently open
 * @param next - name of the element about to be opened; when omitted (or
 *   empty), any `current` listed in `disallowed_contents` counts as closed
 * @returns `true` when `current` is implicitly closed, `false` otherwise
 */
export function closing_tag_omitted(current: string, next?: string) {
  const closers = disallowed_contents.get(current);

  // Elements not in the table never have their closing tag omitted.
  if (!closers) {
    return false;
  }

  return !next || closers.has(next);
}