diff options
| author | 2021-03-25 00:00:22 -0700 | |
|---|---|---|
| committer | 2021-03-25 00:00:22 -0700 | |
| commit | 30cccdf7154b6470e876464da9e412af10894dd5 (patch) | |
| tree | 73ed40b30af23ba3e5b94070e478f3e2ca1670c0 /src/parser/parse/utils/html.ts | |
| parent | a72ab10c623022860691d6a095b74dea70cc6f69 (diff) | |
| download | astro-30cccdf7154b6470e876464da9e412af10894dd5.tar.gz astro-30cccdf7154b6470e876464da9e412af10894dd5.tar.zst astro-30cccdf7154b6470e876464da9e412af10894dd5.zip | |
add component state, top-level await support (#26)
Diffstat (limited to 'src/parser/parse/utils/html.ts')
| -rw-r--r-- | src/parser/parse/utils/html.ts | 143 | 
1 files changed, 143 insertions, 0 deletions
// @ts-nocheck

import entities from './entities.js';

// Replacement code points for numeric references 128-159, indexed by
// `code - 128`. Slots with no replacement keep their own value.
const windows_1252 = [
  // 128 - 135
  8364, 129, 8218, 402, 8222, 8230, 8224, 8225,
  // 136 - 143
  710, 8240, 352, 8249, 338, 141, 381, 143,
  // 144 - 151
  144, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
  // 152 - 159
  732, 8482, 353, 8250, 339, 157, 382, 376,
];

// Every known entity name, joined into one regex alternation.
const entity_names = Object.keys(entities).join('|');

// Matches `&name`, `&#123` and `&#xABC` style references, terminated either
// by `;` or by a word boundary. Capture group 1 is the reference without `&`.
const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${entity_names}))(?:;|\\b)`, 'g');

/**
 * Looks up the numeric code for the text captured after `&`.
 *
 * @param reference - the captured reference, e.g. `amp`, `#38` or `#x26`
 * @returns the code from the entity table for named references, otherwise the
 *   parsed hexadecimal or decimal number (which may be `NaN`)
 */
function resolve_reference(reference: string) {
  const is_numeric = reference[0] === '#';
  if (!is_numeric) {
    return entities[reference];
  }

  const is_hex = reference[1] === 'x';
  return is_hex ? parseInt(reference.substring(2), 16) : parseInt(reference.substring(1), 10);
}

/**
 * Replaces character references in `html` with the characters they stand for.
 *
 * A reference whose resolved code is falsy (unknown name, unparsable number,
 * or zero) is left in the output exactly as it was written.
 *
 * @param html - text that may contain character references
 * @returns the text with every resolvable reference substituted
 */
export function decode_character_references(html: string) {
  return html.replace(entity_pattern, (whole, reference) => {
    const code = resolve_reference(reference);
    return code ? String.fromCodePoint(validate_code(code)) : whole;
  });
}

const NUL = 0;

/**
 * Maps a code point to the one that should actually be emitted.
 *
 * Some code points are not allowed. If the markup were handed to a browser it
 * would swap certain illegal code points for alternatives; since that
 * mechanism is bypassed here, the substitution is done by hand.
 *
 * Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
 *
 * @param code - the code point taken from a character reference
 * @returns the code point to emit, or `NUL` when the input is rejected
 */
function validate_code(code: number) {
  // a line feed is turned into a plain space
  if (code === 0x0a) return 0x20;

  // the ASCII range passes straight through
  if (code < 0x80) return code;

  // 128-159: tolerated by browsers but incorrect; remap through the table
  // above, otherwise characters such as the euro sign would go missing
  if (code <= 0x9f) return windows_1252[code - 0x80];

  // basic multilingual plane, below the surrogates
  if (code < 0xd800) return code;

  // UTF-16 surrogate halves are rejected
  if (code <= 0xdfff) return NUL;

  // remainder of the basic multilingual plane
  if (code <= 0xffff) return code;

  // supplementary multilingual plane (0x10000 - 0x1ffff) and
  // supplementary ideographic plane (0x20000 - 0x2ffff)
  const in_multilingual_plane = code >= 0x10000 && code <= 0x1ffff;
  const in_ideographic_plane = code >= 0x20000 && code <= 0x2ffff;
  return in_multilingual_plane || in_ideographic_plane ? code : NUL;
}

// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
//
// Keyed by the currently open tag; each value is the set of following tag
// names that implicitly close it. Written as space-separated lists and
// expanded into Sets below.
const implicit_closers = {
  li: 'li',
  dt: 'dt dd',
  dd: 'dt dd',
  p: 'address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul',
  rt: 'rt rp',
  rp: 'rt rp',
  optgroup: 'optgroup',
  option: 'option optgroup',
  thead: 'tbody tfoot',
  tbody: 'tbody tfoot',
  tfoot: 'tbody',
  tr: 'tr tbody',
  td: 'td th tr',
  th: 'td th tr',
};

const disallowed_contents = new Map<string, Set<string>>(
  Object.entries(implicit_closers).map(([tag, closers]): [string, Set<string>] => [tag, new Set(closers.split(' '))])
);

/**
 * Decides whether the closing tag of `current` is implied, as in
 * `<li>one<li>two`, rather than `next` being allowed as its child.
 *
 * @param current - name of the element that is currently open
 * @param next - name of the tag that follows, if there is one
 * @returns `true` when `current` has an entry in the omission table and
 *   either no `next` was given or `next` is listed for it; `false` otherwise
 */
export function closing_tag_omitted(current: string, next?: string) {
  const closers = disallowed_contents.get(current);
  if (closers === undefined) {
    return false;
  }

  return !next || closers.has(next);
}
