From 04ecfdd794795a3afe9abf836540b1f664276e48 Mon Sep 17 00:00:00 2001 From: Jarred Sumner Date: Thu, 4 Nov 2021 18:33:00 -0700 Subject: [JS Parser] Print � for invalid WTF-8 input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/js_lexer.zig | 334 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 169 insertions(+), 165 deletions(-) (limited to 'src') diff --git a/src/js_lexer.zig b/src/js_lexer.zig index d378ea670..401ce282c 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -565,20 +565,20 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { stringLiteral: while (true) { switch (lexer.code_point) { '\\' => { - try lexer.step(); + lexer.step(); // Handle Windows CRLF if (lexer.code_point == 'r' and comptime !is_json) { - try lexer.step(); + lexer.step(); if (lexer.code_point == '\n') { - try lexer.step(); + lexer.step(); } continue :stringLiteral; } if (comptime is_json and json_options.ignore_trailing_escape_sequences) { if (lexer.code_point == quote and lexer.current >= lexer.source.contents.len) { - try lexer.step(); + lexer.step(); break; } @@ -587,7 +587,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { switch (lexer.code_point) { // 0 cannot be in this list because it may be a legacy octal literal 'v', 'f', 't', 'r', 'n', '`', '\'', '"', 0x2028, 0x2029 => { - try lexer.step(); + lexer.step(); continue :stringLiteral; }, else => { @@ -631,10 +631,10 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '$' => { if (comptime quote == '`') { - try lexer.step(); + lexer.step(); if (lexer.code_point == '{') { suffix_len = 2; - try lexer.step(); + lexer.step(); if (lexer.rescan_close_brace_as_template_token) { lexer.token = T.t_template_middle; } else { @@ -647,7 +647,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { }, // exit condition quote => { - try lexer.step(); + lexer.step(); break; }, @@ -662,7 +662,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } }, } - try lexer.step(); + lexer.step(); } return InnerStringLiteral{ .needs_slow_path = needs_slow_path, .suffix_len = suffix_len }; @@ -678,7 +678,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } // quote is 0 when parsing JSON from .env // .env values may not always be quoted. - try lexer.step(); + lexer.step(); var string_literal_details = try lexer.parseStringLiteralInnter(quote); @@ -712,28 +712,32 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } inline fn nextCodepointSlice(it: *LexerType) []const u8 { - const cp_len = strings.utf8ByteSequenceLength(it.source.contents.ptr[it.current]); - it.end = it.current; - it.current += cp_len; - - return if (!(it.current > it.source.contents.len)) it.source.contents[it.current - cp_len .. it.current] else ""; + const cp_len = strings.wtf8ByteSequenceLength(it.source.contents.ptr[it.current]); + return if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else ""; } - inline fn nextCodepoint(it: *LexerType) !CodePoint { - const slice = it.nextCodepointSlice(); + inline fn nextCodepoint(it: *LexerType) CodePoint { + const cp_len = strings.wtf8ByteSequenceLength(it.source.contents.ptr[it.current]); + const slice = if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else ""; - return switch (slice.len) { + const code_point = switch (slice.len) { 0 => -1, 1 => @as(CodePoint, slice[0]), - 2 => @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable), - 3 => @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable), - 4 => @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable), - else => unreachable, + else => strings.decodeWTF8RuneTMultibyte(slice.ptr[0..4], @intCast(u3, slice.len), CodePoint, strings.unicode_replacement), }; + + it.end = it.current; + + it.current += if (code_point != strings.unicode_replacement) + cp_len + else + 1; + + return code_point; } - inline fn step(lexer: *LexerType) !void { - lexer.code_point = try lexer.nextCodepoint(); + inline fn step(lexer: *LexerType) void { + lexer.code_point = lexer.nextCodepoint(); // Track the approximate number of newlines in the file so we can preallocate // the line offset table in the printer for source maps. The line offset table @@ -799,19 +803,19 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { // Scan a unicode escape sequence. There is at least one because that's // what caused us to get on this slow path in the first place. if (lexer.code_point == '\\') { - try lexer.step(); + lexer.step(); if (lexer.code_point != 'u') { try lexer.syntaxError(); } - try lexer.step(); + lexer.step(); if (lexer.code_point == '{') { // Variable-length - try lexer.step(); + lexer.step(); while (lexer.code_point != '}') { switch (lexer.code_point) { '0'...'9', 'a'...'f', 'A'...'F' => { - try lexer.step(); + lexer.step(); }, else => { try lexer.syntaxError(); @@ -819,14 +823,14 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } } - try lexer.step(); + lexer.step(); } else { // Fixed-length // comptime var j: usize = 0; switch (lexer.code_point) { '0'...'9', 'a'...'f', 'A'...'F' => { - try lexer.step(); + lexer.step(); }, else => { try lexer.syntaxError(); @@ -834,7 +838,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } switch (lexer.code_point) { '0'...'9', 'a'...'f', 'A'...'F' => { - try lexer.step(); + lexer.step(); }, else => { try lexer.syntaxError(); @@ -842,7 +846,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } switch (lexer.code_point) { '0'...'9', 'a'...'f', 'A'...'F' => { - try lexer.step(); + lexer.step(); }, else => { try lexer.syntaxError(); @@ -850,7 +854,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } switch (lexer.code_point) { '0'...'9', 'a'...'f', 'A'...'F' => { - try lexer.step(); + lexer.step(); }, else => { try lexer.syntaxError(); @@ -863,7 +867,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { if (!isIdentifierContinue(lexer.code_point)) { break; } - try lexer.step(); + lexer.step(); } // Second pass: re-use our existing escape sequence parser @@ -937,17 +941,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '>' => { // "=" + ">" = "=>" lexer.token = .t_equals_greater_than; - try lexer.step(); + lexer.step(); }, '=' => { // "=" + "=" = "==" lexer.token = .t_equals_equals; - try lexer.step(); + lexer.step(); if (lexer.code_point == '=') { // "=" + "==" = "===" lexer.token = .t_equals_equals_equals; - try lexer.step(); + lexer.step(); } }, else => {}, @@ -1038,7 +1042,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { // "#!/usr/bin/env node" lexer.token = .t_hashbang; hashbang: while (true) { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '\r', '\n', 0x2028, 0x2029 => { break :hashbang; @@ -1051,7 +1055,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } lexer.identifier = lexer.raw(); } else { - try lexer.step(); + lexer.step(); if (lexer.code_point == '\\') { lexer.identifier = (try lexer.scanIdentifierWithEscapes(.private)).contents; lexer.token = T.t_private_identifier; @@ -1060,9 +1064,9 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { try lexer.syntaxError(); } - try lexer.step(); + lexer.step(); while (isIdentifierContinue(lexer.code_point)) { - try lexer.step(); + lexer.step(); } if (lexer.code_point == '\\') { lexer.identifier = (try lexer.scanIdentifierWithEscapes(.private)).contents; @@ -1076,67 +1080,67 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } }, '\r', '\n', 0x2028, 0x2029 => { - try lexer.step(); + lexer.step(); lexer.has_newline_before = true; continue; }, '\t', ' ' => { - try lexer.step(); + lexer.step(); continue; }, '(' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_open_paren; }, ')' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_close_paren; }, '[' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_open_bracket; }, ']' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_close_bracket; }, '{' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_open_brace; }, '}' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_close_brace; }, ',' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_comma; }, ':' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_colon; }, ';' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_semicolon; }, '@' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_at; }, '~' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_tilde; }, '?' => { // '?' or '?.' or '??' or '??=' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '?' => { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_question_question_equals; }, else => { @@ -1154,7 +1158,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { if (current < contents.len) { const c = contents[current]; if (c < '0' or c > '9') { - try lexer.step(); + lexer.step(); lexer.token = T.t_question_dot; } } @@ -1166,10 +1170,10 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { }, '%' => { // '%' or '%=' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_percent_equals; }, @@ -1181,18 +1185,18 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '&' => { // '&' or '&=' or '&&' or '&&=' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_ampersand_equals; }, '&' => { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_ampersand_ampersand_equals; }, @@ -1210,17 +1214,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '|' => { // '|' or '|=' or '||' or '||=' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_bar_equals; }, '|' => { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_bar_bar_equals; }, @@ -1237,10 +1241,10 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '^' => { // '^' or '^=' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_caret_equals; }, @@ -1252,15 +1256,15 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '+' => { // '+' or '+=' or '++' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_plus_equals; }, '+' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_plus_plus; }, @@ -1272,18 +1276,18 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '-' => { // '+' or '+=' or '++' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_minus_equals; }, '-' => { - try lexer.step(); + lexer.step(); if (lexer.code_point == '>' and lexer.has_newline_before) { - try lexer.step(); + lexer.step(); lexer.log.addRangeWarning(lexer.source, lexer.range(), "Treating \"-->\" as the start of a legacy HTML single-line comment") catch unreachable; singleLineHTMLCloseComment: while (true) { @@ -1296,7 +1300,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { }, else => {}, } - try lexer.step(); + lexer.step(); } continue; } @@ -1313,17 +1317,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '*' => { // '*' or '*=' or '**' or '**=' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = .t_asterisk_equals; }, '*' => { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = .t_asterisk_asterisk_equals; }, else => { @@ -1338,16 +1342,16 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { }, '/' => { // '/' or '/=' or '//' or '/* ... */' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = .t_slash_equals; }, '/' => { singleLineComment: while (true) { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '\r', '\n', 0x2028, 0x2029 => { break :singleLineComment; @@ -1369,19 +1373,19 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { continue; }, '*' => { - try lexer.step(); + lexer.step(); multiLineComment: while (true) { switch (lexer.code_point) { '*' => { - try lexer.step(); + lexer.step(); if (lexer.code_point == '/') { - try lexer.step(); + lexer.step(); break :multiLineComment; } }, '\r', '\n', 0x2028, 0x2029 => { - try lexer.step(); + lexer.step(); lexer.has_newline_before = true; }, -1 => { @@ -1393,7 +1397,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { ); }, else => { - try lexer.step(); + lexer.step(); }, } } @@ -1414,18 +1418,18 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '=' => { // '=' or '=>' or '==' or '===' - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '>' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_equals_greater_than; }, '=' => { - try lexer.step(); + lexer.step(); switch (lexer.code_point) { '=' => { - try lexer.step(); + lexer.step(); lexer.token = T.t_equals_equals_equals; }, @@ -1443,18 +1447,18 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '<' => { // '<' or '<<' or '<=' or '<<=' or '