// packages/astro-parser/src/parse/utils/html.ts

// @ts-nocheck

import entities from './entities.js';

// Replacement code points for the numeric references 128-159 (0x80-0x9f).
// Entry `i` is the code point to emit for reference `128 + i`; entries that are
// equal to their own index + 128 (0x81, 0x8d, 0x8f, 0x90, 0x9d) are passed through as-is.
const windows_1252 = [
  0x20ac, // 0x80
  0x0081, // 0x81 (unchanged)
  0x201a, // 0x82
  0x0192, // 0x83
  0x201e, // 0x84
  0x2026, // 0x85
  0x2020, // 0x86
  0x2021, // 0x87
  0x02c6, // 0x88
  0x2030, // 0x89
  0x0160, // 0x8a
  0x2039, // 0x8b
  0x0152, // 0x8c
  0x008d, // 0x8d (unchanged)
  0x017d, // 0x8e
  0x008f, // 0x8f (unchanged)
  0x0090, // 0x90 (unchanged)
  0x2018, // 0x91
  0x2019, // 0x92
  0x201c, // 0x93
  0x201d, // 0x94
  0x2022, // 0x95
  0x2013, // 0x96
  0x2014, // 0x97
  0x02dc, // 0x98
  0x2122, // 0x99
  0x0161, // 0x9a
  0x203a, // 0x9b
  0x0153, // 0x9c
  0x009d, // 0x9d (unchanged)
  0x017e, // 0x9e
  0x0178, // 0x9f
];

// `|`-separated alternation of every key of the imported `entities` table.
const entity_names = Object.keys(entities).join('|');

// Global pattern for one character reference: `&`, then a captured body, then either `;`
// or a word boundary. The captured body (group 1) is an optional `#` followed by
// `x` + word characters, a run of decimal digits, or one of the known entity names.
// Note that the `x…` branch accepts any word character, not only hex digits.
const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${entity_names}))(?:;|\\b)`, 'g');

/**
 * Replaces every character reference that `entity_pattern` finds in `html` with the
 * character it stands for.
 *
 * @param html Text that may contain named (`&name;`), decimal (`&#NN;`) or
 *   hexadecimal (`&#xNN;`) character references.
 * @returns A copy of `html` with the resolvable references decoded; references that
 *   resolve to nothing (unknown name, unparsable number, or code 0) are left untouched.
 */
export function decode_character_references(html: string) {
  const decode_reference = (reference: string, body: string) => {
    const is_numeric = body[0] === '#';

    // Named references are looked up in the `entities` table; numeric ones are parsed
    // as hex when the digits are introduced by `x`, as decimal otherwise.
    const code_point = !is_numeric
      ? entities[body]
      : body[1] === 'x'
        ? parseInt(body.substring(2), 16)
        : parseInt(body.substring(1), 10);

    // A falsy result (undefined, NaN or 0) means there is nothing to decode, so the
    // reference is emitted exactly as it was written.
    return code_point ? String.fromCodePoint(validate_code(code_point)) : reference;
  };

  return html.replace(entity_pattern, decode_reference);
}

// Code point returned in place of anything that must not be emitted.
const NUL = 0;

// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
// to replace them ourselves
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
//
// Takes the code point named by a character reference and returns the code point that should
// be emitted instead:
//   - 10 (line feed)             -> 32 (space)
//   - below 128                  -> unchanged
//   - 128-159                    -> looked up in the `windows_1252` table
//   - 0xD800-0xDFFF (surrogates) -> NUL
//   - anything else <= 0x10FFFF  -> unchanged
//   - everything above 0x10FFFF  -> NUL
//
// Every plane up to U+10FFFF is accepted, not just planes 0-2: code points such as the
// supplementary private-use planes (U+F0000 and up) are valid and must survive decoding.
// The result is always within 0..0x10FFFF, so it is safe to pass to `String.fromCodePoint`.
function validate_code(code: number) {
  // line feed becomes generic whitespace
  if (code === 10) {
    return 32;
  }

  // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
  if (code < 128) {
    return code;
  }

  // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
  // to correct the mistake or we'll end up with missing € signs and so on
  if (code <= 159) {
    return windows_1252[code - 128];
  }

  // UTF-16 surrogate halves are not characters in their own right
  if (code >= 0xd800 && code <= 0xdfff) {
    return NUL;
  }

  // rest of the basic multilingual plane and all supplementary planes (up to U+10FFFF)
  if (code <= 0x10ffff) {
    return code;
  }

  // beyond the end of Unicode (this also covers Infinity from absurdly long digit runs)
  return NUL;
}

// Lookup table keyed by an element name; each value is the set of tag names that,
// when they come next, implicitly close that element.
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
const disallowed_contents = new Map<string, Set<string>>(
  Object.entries({
    li: ['li'],
    dt: ['dt', 'dd'],
    dd: ['dt', 'dd'],
    p: 'address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split(' '),
    rt: ['rt', 'rp'],
    rp: ['rt', 'rp'],
    optgroup: ['optgroup'],
    option: ['option', 'optgroup'],
    thead: ['tbody', 'tfoot'],
    tbody: ['tbody', 'tfoot'],
    tfoot: ['tbody'],
    tr: ['tr', 'tbody'],
    td: ['td', 'th', 'tr'],
    th: ['td', 'th', 'tr'],
  }).map(([tag, closers]): [string, Set<string>] => [tag, new Set(closers)])
);

/**
 * Decides whether the currently open element is closed implicitly, as in `<li>one<li>two`,
 * rather than being able to contain what follows.
 *
 * @param current Name of the element that is currently open.
 * @param next Name of the tag that comes next, if any.
 * @returns `true` when `current` has an entry in the table and `next` is either
 *   missing/empty or listed for it; `false` in every other case.
 */
export function closing_tag_omitted(current: string, next?: string) {
  const closers = disallowed_contents.get(current);

  // elements without an entry never have their closing tag omitted
  if (closers === undefined) {
    return false;
  }

  return !next || closers.has(next);
}