diff options
author | 2021-03-19 17:07:45 -0400 | |
---|---|---|
committer | 2021-03-19 17:07:45 -0400 | |
commit | 17c3c98f07628b43b941b84831e8e1f9bcd7ca46 (patch) | |
tree | 2e2b3c7d6bd67ebaabe6636ae6867ad368ac6c3a /src/compiler/parse/utils/html.ts | |
parent | 8ebc077cb0d9f50aae22d2651bd5ef13fe4641d3 (diff) | |
download | astro-17c3c98f07628b43b941b84831e8e1f9bcd7ca46.tar.gz astro-17c3c98f07628b43b941b84831e8e1f9bcd7ca46.tar.zst astro-17c3c98f07628b43b941b84831e8e1f9bcd7ca46.zip |
Initial tests set up (#10)
* Begin debugging
* Initial tests set up
This adds tests using uvu (we can switch if people want) and restructures things a bit so that it's easier to test.
As in Snowpack, you set up a small project. In our tests you can then write:
```js
const result = await runtime.load('/blog/hello-world')
```
You can then analyze the result. I included a `test-helpers.js` with a function that turns HTML into a cheerio instance, for inspecting the resulting HTML.
* Add CI
* Remove extra console logs
* Formatting
Diffstat (limited to 'src/compiler/parse/utils/html.ts')
-rw-r--r-- | src/compiler/parse/utils/html.ts | 236 |
1 files changed, 113 insertions, 123 deletions
diff --git a/src/compiler/parse/utils/html.ts b/src/compiler/parse/utils/html.ts index d622be878..3b406c9cc 100644 --- a/src/compiler/parse/utils/html.ts +++ b/src/compiler/parse/utils/html.ts @@ -3,64 +3,61 @@ import entities from './entities.js'; const windows_1252 = [ - 8364, - 129, - 8218, - 402, - 8222, - 8230, - 8224, - 8225, - 710, - 8240, - 352, - 8249, - 338, - 141, - 381, - 143, - 144, - 8216, - 8217, - 8220, - 8221, - 8226, - 8211, - 8212, - 732, - 8482, - 353, - 8250, - 339, - 157, - 382, - 376 + 8364, + 129, + 8218, + 402, + 8222, + 8230, + 8224, + 8225, + 710, + 8240, + 352, + 8249, + 338, + 141, + 381, + 143, + 144, + 8216, + 8217, + 8220, + 8221, + 8226, + 8211, + 8212, + 732, + 8482, + 353, + 8250, + 339, + 157, + 382, + 376, ]; -const entity_pattern = new RegExp( - `&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, - 'g' -); +const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g'); export function decode_character_references(html: string) { - return html.replace(entity_pattern, (match, entity) => { - let code; - - // Handle named entities - if (entity[0] !== '#') { - code = entities[entity]; - } else if (entity[1] === 'x') { - code = parseInt(entity.substring(2), 16); - } else { - code = parseInt(entity.substring(1), 10); - } - - if (!code) { - return match; - } - - return String.fromCodePoint(validate_code(code)); - }); + return html.replace(entity_pattern, (match, entity) => { + let code; + + // Handle named entities + if (entity[0] !== '#') { + code = entities[entity]; + } else if (entity[1] === 'x') { + code = parseInt(entity.substring(2), 16); + } else { + code = parseInt(entity.substring(1), 10); + } + + if (!code) { + return match; + } + + return String.fromCodePoint(validate_code(code)); + }); } const NUL = 0; @@ -71,83 +68,76 @@ const NUL = 0; // // Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters function validate_code(code: 
number) { - // line feed becomes generic whitespace - if (code === 10) { - return 32; - } - - // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...) - if (code < 128) { - return code; - } - - // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need - // to correct the mistake or we'll end up with missing € signs and so on - if (code <= 159) { - return windows_1252[code - 128]; - } - - // basic multilingual plane - if (code < 55296) { - return code; - } - - // UTF-16 surrogate halves - if (code <= 57343) { - return NUL; - } - - // rest of the basic multilingual plane - if (code <= 65535) { - return code; - } - - // supplementary multilingual plane 0x10000 - 0x1ffff - if (code >= 65536 && code <= 131071) { - return code; - } - - // supplementary ideographic plane 0x20000 - 0x2ffff - if (code >= 131072 && code <= 196607) { - return code; - } - - return NUL; + // line feed becomes generic whitespace + if (code === 10) { + return 32; + } + + // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...) + if (code < 128) { + return code; + } + + // code points 128-159 are dealt with leniently by browsers, but they're incorrect. 
We need + // to correct the mistake or we'll end up with missing € signs and so on + if (code <= 159) { + return windows_1252[code - 128]; + } + + // basic multilingual plane + if (code < 55296) { + return code; + } + + // UTF-16 surrogate halves + if (code <= 57343) { + return NUL; + } + + // rest of the basic multilingual plane + if (code <= 65535) { + return code; + } + + // supplementary multilingual plane 0x10000 - 0x1ffff + if (code >= 65536 && code <= 131071) { + return code; + } + + // supplementary ideographic plane 0x20000 - 0x2ffff + if (code >= 131072 && code <= 196607) { + return code; + } + + return NUL; } // based on http://developers.whatwg.org/syntax.html#syntax-tag-omission const disallowed_contents = new Map([ - ['li', new Set(['li'])], - ['dt', new Set(['dt', 'dd'])], - ['dd', new Set(['dt', 'dd'])], - [ - 'p', - new Set( - 'address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split( - ' ' - ) - ) - ], - ['rt', new Set(['rt', 'rp'])], - ['rp', new Set(['rt', 'rp'])], - ['optgroup', new Set(['optgroup'])], - ['option', new Set(['option', 'optgroup'])], - ['thead', new Set(['tbody', 'tfoot'])], - ['tbody', new Set(['tbody', 'tfoot'])], - ['tfoot', new Set(['tbody'])], - ['tr', new Set(['tr', 'tbody'])], - ['td', new Set(['td', 'th', 'tr'])], - ['th', new Set(['td', 'th', 'tr'])] + ['li', new Set(['li'])], + ['dt', new Set(['dt', 'dd'])], + ['dd', new Set(['dt', 'dd'])], + ['p', new Set('address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split(' '))], + ['rt', new Set(['rt', 'rp'])], + ['rp', new Set(['rt', 'rp'])], + ['optgroup', new Set(['optgroup'])], + ['option', new Set(['option', 'optgroup'])], + ['thead', new Set(['tbody', 'tfoot'])], + ['tbody', new Set(['tbody', 'tfoot'])], + ['tfoot', new Set(['tbody'])], + ['tr', new Set(['tr', 'tbody'])], + ['td', new Set(['td', 'th', 
'tr'])], + ['th', new Set(['td', 'th', 'tr'])], ]); // can this be a child of the parent element, or does it implicitly // close it, like `<li>one<li>two`? export function closing_tag_omitted(current: string, next?: string) { - if (disallowed_contents.has(current)) { - if (!next || disallowed_contents.get(current).has(next)) { - return true; - } - } + if (disallowed_contents.has(current)) { + if (!next || disallowed_contents.get(current).has(next)) { + return true; + } + } - return false; + return false; } |