summaryrefslogtreecommitdiff
path: root/packages/astro-parser/src/parse/utils/html.ts
blob: e4669a2dbf014c0ab08af6e79d9703fc85cbdb24 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
// @ts-nocheck

import entities from './entities.js';

const windows_1252 = [
  8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376,
];

const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g');

export function decode_character_references(html: string) {
  return html.replace(entity_pattern, (match, entity) => {
    let code;

    // Handle named entities
    if (entity[0] !== '#') {
      code = entities[entity];
    } else if (entity[1] === 'x') {
      code = parseInt(entity.substring(2), 16);
    } else {
      code = parseInt(entity.substring(1), 10);
    }

    if (!code) {
      return match;
    }

    return String.fromCodePoint(validate_code(code));
  });
}

const NUL = 0;

// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
// to replace them ourselves
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
function validate_code(code: number) {
  // line feed becomes generic whitespace
  if (code === 10) {
    return 32;
  }

  // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
  if (code < 128) {
    return code;
  }

  // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
  // to correct the mistake or we'll end up with missing € signs and so on
  if (code <= 159) {
    return windows_1252[code - 128];
  }

  // basic multilingual plane
  if (code < 55296) {
    return code;
  }

  // UTF-16 surrogate halves
  if (code <= 57343) {
    return NUL;
  }

  // rest of the basic multilingual plane
  if (code <= 65535) {
    return code;
  }

  // supplementary multilingual plane 0x10000 - 0x1ffff
  if (code >= 65536 && code <= 131071) {
    return code;
  }

  // supplementary ideographic plane 0x20000 - 0x2ffff
  if (code >= 131072 && code <= 196607) {
    return code;
  }

  return NUL;
}

// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
const disallowed_contents = new Map([
  ['li', new Set(['li'])],
  ['dt', new Set(['dt', 'dd'])],
  ['dd', new Set(['dt', 'dd'])],
  ['p', new Set('address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split(' '))],
  ['rt', new Set(['rt', 'rp'])],
  ['rp', new Set(['rt', 'rp'])],
  ['optgroup', new Set(['optgroup'])],
  ['option', new Set(['option', 'optgroup'])],
  ['thead', new Set(['tbody', 'tfoot'])],
  ['tbody', new Set(['tbody', 'tfoot'])],
  ['tfoot', new Set(['tbody'])],
  ['tr', new Set(['tr', 'tbody'])],
  ['td', new Set(['td', 'th', 'tr'])],
  ['th', new Set(['td', 'th', 'tr'])],
]);

// can this be a child of the parent element, or does it implicitly
// close it, like `<li>one<li>two`?
export function closing_tag_omitted(current: string, next?: string) {
  if (disallowed_contents.has(current)) {
    if (!next || disallowed_contents.get(current).has(next)) {
      return true;
    }
  }

  return false;
}