diff options
author | 2021-09-20 22:37:22 -0700 | |
---|---|---|
committer | 2021-09-20 22:37:22 -0700 | |
commit | ab565f1a3ca2d9a320c958786fb3a9df3392ceae (patch) | |
tree | da5d9dbf6fd6b0371661ae8355e664fe909ec406 /misctools/gen-unicode-table.js | |
parent | 90cd3bf4cbe06b1795cd47bcae670d5970564abd (diff) | |
download | bun-ab565f1a3ca2d9a320c958786fb3a9df3392ceae.tar.gz bun-ab565f1a3ca2d9a320c958786fb3a9df3392ceae.tar.zst bun-ab565f1a3ca2d9a320c958786fb3a9df3392ceae.zip |
Fix parsing/printing unicode identifiers, switch to UTF-8 for prefilled strings, remove eagerly loading identifier_name
Diffstat (limited to 'misctools/gen-unicode-table.js')
-rw-r--r-- | misctools/gen-unicode-table.js | 138 |
1 files changed, 138 insertions, 0 deletions
// Source: misctools/gen-unicode-table.js (new file, mode 100644, blob 6ee02f5e8).
//
// Generates src/js_lexer/unicode.zig: range tables describing which Unicode
// code points may start / continue a JavaScript identifier.
//
// Thank you @evanw for this code!!!
const fs = require("fs");
const path = require("path");

// ES5 reference: https://es5.github.io/
//
// A conforming implementation of this International standard shall interpret
// characters in conformance with the Unicode Standard, Version 3.0 or later
// and ISO/IEC 10646-1 with either UCS-2 or UTF-16 as the adopted encoding
// form, implementation level 3. If the adopted ISO/IEC 10646-1 subset is not
// otherwise specified, it is presumed to be the BMP subset, collection 300.
//
// UnicodeLetter: any character in the Unicode categories “Uppercase letter (Lu)”,
// “Lowercase letter (Ll)”, “Titlecase letter (Lt)”, “Modifier letter (Lm)”,
// “Other letter (Lo)”, or “Letter number (Nl)”.
const idStartES5 = []
  .concat(
    require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points")

    // The "letter number" category is not included because old versions of Safari
    // had a bug where they didn't include it. This means it does not match ES5.
    // We need to make sure we escape these characters so Safari can read them.
    // See https://github.com/evanw/esbuild/issues/1349 for more information.
    // require('@unicode/unicode-3.0.0/General_Category/Letter_Number/code-points'),
  )
  .sort((a, b) => a - b);

// UnicodeCombiningMark: any character in the Unicode categories “Non-spacing mark (Mn)”
// or “Combining spacing mark (Mc)”
// UnicodeDigit: any character in the Unicode category “Decimal number (Nd)”
// UnicodeConnectorPunctuation: any character in the Unicode category “Connector punctuation (Pc)”
const idContinueES5 = idStartES5
  .concat(
    require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
    require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points")
  )
  .sort((a, b) => a - b);

// ESNext reference: https://tc39.es/ecma262/
//
// A conforming implementation of ECMAScript must interpret source text input
// in conformance with the Unicode Standard, Version 5.1.0 or later and ISO/IEC
// 10646. If the adopted ISO/IEC 10646-1 subset is not otherwise specified, it
// is presumed to be the Unicode set, collection 10646.
//
// UnicodeIDStart: any Unicode code point with the Unicode property “ID_Start”
const idStartESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Start/code-points");
const idStartESNextSet = new Set(idStartESNext);

// UnicodeIDContinue: any Unicode code point with the Unicode property “ID_Continue”
const idContinueESNext = require("@unicode/unicode-13.0.0/Binary_Property/ID_Continue/code-points");
const idContinueESNextSet = new Set(idContinueESNext);

// These identifiers are valid in both ES5 and ES6+ (i.e. an intersection of both)
// NOTE(review): neither intersection is referenced by the output written below.
const idStartES5AndESNext = idStartES5.filter((n) => idStartESNextSet.has(n));
const idContinueES5AndESNext = idContinueES5.filter((n) =>
  idContinueESNextSet.has(n)
);

// These identifiers are valid in either ES5 or ES6+ (i.e. a union of both)
const idStartES5OrESNext = [...new Set(idStartES5.concat(idStartESNext))].sort(
  (a, b) => a - b
);
const idContinueES5OrESNext = [
  ...new Set(idContinueES5.concat(idContinueESNext)),
].sort((a, b) => a - b);

// Code points strictly below this value are emitted into the R16Range table;
// everything else goes into the R32Range table.
// NOTE(review): the value is 0x1000, not 0x10000, so code points in
// 0x1000..0xFFFF land in the R32 table even though they fit in 16 bits.
// Left as-is because changing it alters the generated file and the consumer
// (range_table.zig) is not visible from this script — confirm before changing.
const R16_LIMIT = 0x1000;

/**
 * Collapses runs of consecutive code points into `.{lo, hi},` Zig tuple lines.
 *
 * Scanning starts at `startIndex` and stops at the first code point that is
 * not strictly below `limit` (or at the end of the array). A run is never
 * extended across `limit`.
 *
 * @param {number[]} codePoints - Code points; assumed sorted ascending and
 *   free of duplicates (callers in this file sort/dedupe the unions — the
 *   arrays loaded straight from the `@unicode` packages are presumed sorted).
 * @param {number} startIndex - Index in `codePoints` at which to begin.
 * @param {number} limit - Exclusive upper bound for code points in this table.
 * @returns {{ lines: string[], nextIndex: number }} The emitted lines and the
 *   index of the first code point that was not consumed.
 */
function collectRanges(codePoints, startIndex, limit) {
  const lines = [];
  let index = startIndex;

  while (index < codePoints.length && codePoints[index] < limit) {
    const start = codePoints[index];
    index++;
    // Extend the run while the next code point is adjacent and still in-table.
    while (
      index < codePoints.length &&
      codePoints[index] < limit &&
      codePoints[index] === codePoints[index - 1] + 1
    ) {
      index++;
    }
    const end = codePoints[index - 1];
    lines.push(` .{0x${start.toString(16)}, 0x${end.toString(16)}},`);
  }

  return { lines, nextIndex: index };
}

/**
 * Renders a list of code points as a Zig `RangeTable.init(...)` expression.
 *
 * The expression has three arguments: the number of leading code points that
 * are <= 0xff, an `R16Range` array literal, and an `R32Range` array literal.
 *
 * @param {number[]} codePoints - Code points, assumed sorted ascending.
 * @returns {string} Zig source text, terminated by `);`.
 */
function generateRangeTable(codePoints) {
  // Counts individual code points (not ranges) at the front that are <= 0xff.
  let latinOffset = 0;
  while (latinOffset < codePoints.length && codePoints[latinOffset] <= 0xff) {
    latinOffset++;
  }

  const r16 = collectRanges(codePoints, 0, R16_LIMIT);
  // No upper bound for the second table: consume everything that is left.
  const r32 = collectRanges(codePoints, r16.nextIndex, Infinity);

  return [
    `RangeTable.init(`,
    ` ${latinOffset},`,
    ` &[_]R16Range{`,
    ...r16.lines,
    ` },`,
    `&[_]R32Range{`,
    ...r32.lines,
    ` },`,
    `);`,
  ].join("\n");
}

// Overwrites ../src/js_lexer/unicode.zig (relative to this script) with the
// generated tables. The `id_*` tables accept anything valid in ES5 or ESNext;
// the `printable_id_*` tables are built from the ESNext lists only.
fs.writeFileSync(
  path.join(__dirname, "..", "src", "js_lexer", "unicode.zig"),
  `// This file was automatically generated by ${path.basename(
    __filename
  )}. Do not edit.

 const RangeTable = @import("./range_table.zig");


// ES5 || ESNext
pub const id_start = ${generateRangeTable(idStartES5OrESNext)}

// ES5 || ESNext
pub const id_continue = ${generateRangeTable(idContinueES5OrESNext)}

pub const printable_id_start = ${generateRangeTable(idStartESNext)}
pub const printable_id_continue = ${generateRangeTable(idContinueESNext)}
`
);