diff options
author | 2021-04-25 00:56:48 -0700 | |
---|---|---|
committer | 2021-04-25 00:56:48 -0700 | |
commit | c0b7f71b9ace0bfd91fb5e0364f104b5093e6c37 (patch) | |
tree | 7da69736690eb11b974800ad947d986938e3bb50 /src/js_lexer.zig | |
parent | 322b5cbd382c9a59deb20dd307dd6d09bd844482 (diff) | |
download | bun-c0b7f71b9ace0bfd91fb5e0364f104b5093e6c37.tar.gz bun-c0b7f71b9ace0bfd91fb5e0364f104b5093e6c37.tar.zst bun-c0b7f71b9ace0bfd91fb5e0364f104b5093e6c37.zip |
little defines, little readme, json parser
Diffstat (limited to 'src/js_lexer.zig')
-rw-r--r-- | src/js_lexer.zig | 2202 |
1 files changed, 1192 insertions, 1010 deletions
diff --git a/src/js_lexer.zig b/src/js_lexer.zig index a16c72c19..fa95341a5 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -22,1123 +22,1264 @@ pub const StrictModeReservedWords = tables.StrictModeReservedWords; pub const PropertyModifierKeyword = tables.PropertyModifierKeyword; pub const TypescriptStmtKeyword = tables.TypescriptStmtKeyword; -// TODO: JSON -const IS_JSON_FILE = false; - -pub const Lexer = struct { - // pub const Error = error{ - // UnexpectedToken, - // EndOfFile, - // }; - - // err: ?Lexer.Error, - log: logger.Log, - source: logger.Source, - current: usize = 0, - start: usize = 0, - end: usize = 0, - approximate_newline_count: i32 = 0, - legacy_octal_loc: logger.Loc = logger.Loc.Empty, - previous_backslash_quote_in_jsx: logger.Range = logger.Range.None, - token: T = T.t_end_of_file, - has_newline_before: bool = false, - has_pure_comment_before: bool = false, - preserve_all_comments_before: bool = false, - is_legacy_octal_literal: bool = false, - comments_to_preserve_before: ?[]js_ast.G.Comment = null, - all_original_comments: ?[]js_ast.G.Comment = null, - code_point: CodePoint = -1, - string_literal: JavascriptString, - identifier: []const u8 = "", - jsx_factory_pragma_comment: ?js_ast.Span = null, - jsx_fragment_pragma_comment: ?js_ast.Span = null, - source_mapping_url: ?js_ast.Span = null, - number: f64 = 0.0, - rescan_close_brace_as_template_token: bool = false, - for_global_name: bool = false, - prev_error_loc: logger.Loc = logger.Loc.Empty, - allocator: *std.mem.Allocator, - - pub fn loc(self: *Lexer) logger.Loc { - return logger.usize2Loc(self.start); - } +pub const JSONOptions = struct { + allow_comments: bool = false, + allow_trailing_commas: bool = false, +}; - fn nextCodepointSlice(it: *Lexer) callconv(.Inline) ?[]const u8 { - if (it.current >= it.source.contents.len) { - return null; +pub fn NewLexerType(comptime jsonOptions: ?JSONOptions) type { + return struct { + // pub const Error = error{ + // UnexpectedToken, + // EndOfFile, + // }; + + // err: ?@This().Error, + log: logger.Log, + source: logger.Source, + current: usize = 0, + start: usize = 0, + end: usize = 0, + approximate_newline_count: i32 = 0, + legacy_octal_loc: logger.Loc = logger.Loc.Empty, + previous_backslash_quote_in_jsx: logger.Range = logger.Range.None, + token: T = T.t_end_of_file, + has_newline_before: bool = false, + has_pure_comment_before: bool = false, + preserve_all_comments_before: bool = false, + is_legacy_octal_literal: bool = false, + comments_to_preserve_before: std.ArrayList(js_ast.G.Comment), + all_original_comments: ?[]js_ast.G.Comment = null, + code_point: CodePoint = -1, + string_literal: JavascriptString, + identifier: []const u8 = "", + jsx_factory_pragma_comment: ?js_ast.Span = null, + jsx_fragment_pragma_comment: ?js_ast.Span = null, + source_mapping_url: ?js_ast.Span = null, + number: f64 = 0.0, + rescan_close_brace_as_template_token: bool = false, + for_global_name: bool = false, + prev_error_loc: logger.Loc = logger.Loc.Empty, + allocator: *std.mem.Allocator, + + pub fn loc(self: *@This()) logger.Loc { + return logger.usize2Loc(self.start); } - const cp_len = unicode.utf8ByteSequenceLength(it.source.contents[it.current]) catch unreachable; - it.end = it.current; - it.current += cp_len; - - return it.source.contents[it.current - cp_len .. it.current]; - } - - pub fn syntaxError(self: *Lexer) void { - self.addError(self.start, "Syntax Error!!", .{}, true); - } + fn nextCodepointSlice(it: *@This()) callconv(.Inline) ?[]const u8 { + if (it.current >= it.source.contents.len) { + return null; + } - pub fn addDefaultError(self: *Lexer, msg: []const u8) void { - self.addError(self.start, "{s}", .{msg}, true); - } + const cp_len = unicode.utf8ByteSequenceLength(it.source.contents[it.current]) catch unreachable; + it.end = it.current; + it.current += cp_len; - pub fn addError(self: *Lexer, _loc: usize, comptime format: []const u8, args: anytype, panic: bool) void { - var __loc = logger.usize2Loc(_loc); - if (__loc.eql(self.prev_error_loc)) { - return; + return it.source.contents[it.current - cp_len .. it.current]; } - const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable; - self.log.addError(self.source, __loc, errorMessage) catch unreachable; - self.prev_error_loc = __loc; - var msg = self.log.msgs.items[self.log.msgs.items.len - 1]; - msg.formatNoWriter(std.debug.panic); - } + pub fn syntaxError(self: *@This()) void { + self.addError(self.start, "Syntax Error!!", .{}, true); + } - pub fn addRangeError(self: *Lexer, r: logger.Range, comptime format: []const u8, args: anytype, panic: bool) void { - if (self.prev_error_loc.eql(r.loc)) { - return; + pub fn addDefaultError(self: *@This(), msg: []const u8) void { + self.addError(self.start, "{s}", .{msg}, true); } - const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable; - var msg = self.log.addRangeError(self.source, r, errorMessage); - self.prev_error_loc = r.loc; + pub fn addError(self: *@This(), _loc: usize, comptime format: []const u8, args: anytype, panic: bool) void { + var __loc = logger.usize2Loc(_loc); + if (__loc.eql(self.prev_error_loc)) { + return; + } - if (panic) { - self.doPanic(errorMessage); + const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable; + self.log.addError(self.source, __loc, errorMessage) catch unreachable; + self.prev_error_loc = __loc; + var msg = self.log.msgs.items[self.log.msgs.items.len - 1]; + msg.formatNoWriter(std.debug.panic); } - } - fn doPanic(self: *Lexer, content: []const u8) void { - std.debug.panic("{s}", .{content}); - } + pub fn addRangeError(self: *@This(), r: logger.Range, comptime format: []const u8, args: anytype, panic: bool) void { + if (self.prev_error_loc.eql(r.loc)) { + return; + } - pub fn codePointEql(self: *Lexer, a: u8) bool { - return @intCast(CodePoint, a) == self.code_point; - } + const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable; + var msg = self.log.addRangeError(self.source, r, errorMessage); + self.prev_error_loc = r.loc; - fn nextCodepoint(it: *Lexer) callconv(.Inline) CodePoint { - const slice = it.nextCodepointSlice() orelse return @as(CodePoint, -1); + if (panic) { + self.doPanic(errorMessage); + } + } - switch (slice.len) { - 1 => return @as(CodePoint, slice[0]), - 2 => return @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable), - 3 => return @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable), - 4 => return @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable), - else => unreachable, + fn doPanic(self: *@This(), content: []const u8) void { + std.debug.panic("{s}", .{content}); } - } - /// Look ahead at the next n codepoints without advancing the iterator. - /// If fewer than n codepoints are available, then return the remainder of the string. - fn peek(it: *Lexer, n: usize) []const u8 { - const original_i = it.current; - defer it.current = original_i; - - var end_ix = original_i; - var found: usize = 0; - while (found < n) : (found += 1) { - const next_codepoint = it.nextCodepointSlice() orelse return it.source.contents[original_i..]; - end_ix += next_codepoint.len; + pub fn codePointEql(self: *@This(), a: u8) bool { + return @intCast(CodePoint, a) == self.code_point; } - return it.source.contents[original_i..end_ix]; - } + fn nextCodepoint(it: *@This()) callconv(.Inline) CodePoint { + const slice = it.nextCodepointSlice() orelse return @as(CodePoint, -1); - pub fn isIdentifierOrKeyword(lexer: Lexer) bool { - return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier); - } + switch (slice.len) { + 1 => return @as(CodePoint, slice[0]), + 2 => return @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable), + 3 => return @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable), + 4 => return @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable), + else => unreachable, + } + } - fn parseStringLiteral(lexer: *Lexer) void { - var quote: CodePoint = lexer.code_point; - var needs_slow_path = false; - var suffixLen: usize = 1; - - if (quote != '`') { - lexer.token = T.t_string_literal; - } else if (lexer.rescan_close_brace_as_template_token) { - lexer.token = T.t_template_tail; - } else { - lexer.token = T.t_no_substitution_template_literal; + /// Look ahead at the next n codepoints without advancing the iterator. + /// If fewer than n codepoints are available, then return the remainder of the string. + fn peek(it: *@This(), n: usize) []const u8 { + const original_i = it.current; + defer it.current = original_i; + + var end_ix = original_i; + var found: usize = 0; + while (found < n) : (found += 1) { + const next_codepoint = it.nextCodepointSlice() orelse return it.source.contents[original_i..]; + end_ix += next_codepoint.len; + } + + return it.source.contents[original_i..end_ix]; } - lexer.step(); - stringLiteral: while (true) { - switch (lexer.code_point) { - '\\' => { - needs_slow_path = true; - lexer.step(); + pub fn isIdentifierOrKeyword(lexer: @This()) bool { + return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier); + } - // Handle Windows CRLF - if (lexer.code_point == '\r' and !IS_JSON_FILE) { + fn parseStringLiteral(lexer: *@This()) void { + var quote: CodePoint = lexer.code_point; + var needs_slow_path = false; + var suffixLen: usize = 1; + + if (quote != '`') { + lexer.token = T.t_string_literal; + } else if (lexer.rescan_close_brace_as_template_token) { + lexer.token = T.t_template_tail; + } else { + lexer.token = T.t_no_substitution_template_literal; + } + lexer.step(); + + stringLiteral: while (true) { + switch (lexer.code_point) { + '\\' => { + needs_slow_path = true; lexer.step(); - if (lexer.code_point == '\n') { + + // Handle Windows CRLF + if (lexer.code_point == '\r' and jsonOptions != null) { lexer.step(); + if (lexer.code_point == '\n') { + lexer.step(); + } + continue :stringLiteral; } - continue :stringLiteral; - } - }, - // This indicates the end of the file - - -1 => { - lexer.addDefaultError("Unterminated string literal"); - }, + }, + // This indicates the end of the file - '\r' => { - if (quote != '`') { + -1 => { lexer.addDefaultError("Unterminated string literal"); - } + }, - // Template literals require newline normalization - needs_slow_path = true; - }, + '\r' => { + if (quote != '`') { + lexer.addDefaultError("Unterminated string literal"); + } - '\n' => { - if (quote != '`') { - lexer.addDefaultError("Unterminated string literal"); - } - }, + // Template literals require newline normalization + needs_slow_path = true; + }, - '$' => { - if (quote == '`') { - lexer.step(); - if (lexer.code_point == '{') { - suffixLen = 2; + '\n' => { + if (quote != '`') { + lexer.addDefaultError("Unterminated string literal"); + } + }, + + '$' => { + if (quote == '`') { lexer.step(); - if (lexer.rescan_close_brace_as_template_token) { - lexer.token = T.t_template_middle; - } else { - lexer.token = T.t_template_head; + if (lexer.code_point == '{') { + suffixLen = 2; + lexer.step(); + if (lexer.rescan_close_brace_as_template_token) { + lexer.token = T.t_template_middle; + } else { + lexer.token = T.t_template_head; + } + break :stringLiteral; } - break :stringLiteral; + continue :stringLiteral; } - continue :stringLiteral; - } - }, + }, - else => { - if (quote == lexer.code_point) { - lexer.step(); - break :stringLiteral; - } - // Non-ASCII strings need the slow path - if (lexer.code_point >= 0x80) { - needs_slow_path = true; - } else if (IS_JSON_FILE and lexer.code_point < 0x20) { - lexer.syntaxError(); - } - }, + else => { + if (quote == lexer.code_point) { + lexer.step(); + break :stringLiteral; + } + // Non-ASCII strings need the slow path + if (lexer.code_point >= 0x80) { + needs_slow_path = true; + } else if (jsonOptions != null and lexer.code_point < 0x20) { + lexer.syntaxError(); + } + }, + } + lexer.step(); } - lexer.step(); - } - const text = lexer.source.contents[lexer.start + 1 .. lexer.end - suffixLen]; - // TODO: actually implement proper utf16 - lexer.string_literal = lexer.allocator.alloc(u16, text.len) catch unreachable; - var i: usize = 0; - for (text) |byte| { - lexer.string_literal[i] = byte; - i += 1; - } - // for (text) - // // if (needs_slow_path) { - // // // Slow path + const text = lexer.source.contents[lexer.start + 1 .. lexer.end - suffixLen]; + if (needs_slow_path) { + lexer.string_literal = lexer.stringToUTF16(text); + } else { + lexer.string_literal = lexer.allocator.alloc(u16, text.len) catch unreachable; + var i: usize = 0; + for (text) |byte| { + lexer.string_literal[i] = byte; + i += 1; + } + } - // // // lexer.string_literal = lexer.(lexer.start + 1, text); - // // } else { - // // // Fast path + if (quote == '\'' and jsonOptions != null) { + lexer.addRangeError(lexer.range(), "JSON strings must use double quotes", .{}, true); + } + // for (text) + // // if (needs_slow_path) { + // // // Slow path - // // } - } + // // // lexer.string_literal = lexer.(lexer.start + 1, text); + // // } else { + // // // Fast path - fn step(lexer: *Lexer) void { - lexer.code_point = lexer.nextCodepoint(); - - // Track the approximate number of newlines in the file so we can preallocate - // the line offset table in the printer for source maps. The line offset table - // is the #1 highest allocation in the heap profile, so this is worth doing. - // This count is approximate because it handles "\n" and "\r\n" (the common - // cases) but not "\r" or "\u2028" or "\u2029". Getting this wrong is harmless - // because it's only a preallocation. The array will just grow if it's too small. - if (lexer.code_point == '\n') { - lexer.approximate_newline_count += 1; + // // } } - } - pub fn expect(self: *Lexer, comptime token: T) void { - if (self.token != token) { - self.expected(token); + fn step(lexer: *@This()) void { + lexer.code_point = lexer.nextCodepoint(); + + // Track the approximate number of newlines in the file so we can preallocate + // the line offset table in the printer for source maps. The line offset table + // is the #1 highest allocation in the heap profile, so this is worth doing. + // This count is approximate because it handles "\n" and "\r\n" (the common + // cases) but not "\r" or "\u2028" or "\u2029". Getting this wrong is harmless + // because it's only a preallocation. The array will just grow if it's too small. + if (lexer.code_point == '\n') { + lexer.approximate_newline_count += 1; + } } - self.next(); - } + pub fn expect(self: *@This(), comptime token: T) void { + if (self.token != token) { + self.expected(token); + } - pub fn expectOrInsertSemicolon(lexer: *Lexer) void { - if (lexer.token == T.t_semicolon or (!lexer.has_newline_before and - lexer.token != T.t_close_brace and lexer.token != T.t_end_of_file)) - { - lexer.expect(T.t_semicolon); + self.next(); } - } - pub fn addUnsupportedSyntaxError(self: *Lexer, msg: []const u8) void { - self.addError(self.end, "Unsupported syntax: {s}", .{msg}, true); - } + pub fn expectOrInsertSemicolon(lexer: *@This()) void { + if (lexer.token == T.t_semicolon or (!lexer.has_newline_before and + lexer.token != T.t_close_brace and lexer.token != T.t_end_of_file)) + { + lexer.expect(T.t_semicolon); + } + } - pub fn scanIdentifierWithEscapes(self: *Lexer) void { - self.addUnsupportedSyntaxError("escape sequence"); - return; - } + pub fn addUnsupportedSyntaxError(self: *@This(), msg: []const u8) void { + self.addError(self.end, "Unsupported syntax: {s}", .{msg}, true); + } + + pub fn scanIdentifierWithEscapes(self: *@This()) void { + self.addUnsupportedSyntaxError("escape sequence"); + return; + } - pub fn debugInfo(self: *Lexer) void { - if (self.log.errors > 0) { - const stderr = std.io.getStdErr().writer(); - self.log.print(stderr) catch unreachable; - } else { - if (self.token == T.t_identifier or self.token == T.t_string_literal) { - std.debug.print(" {s} ", .{self.raw()}); + pub fn debugInfo(self: *@This()) void { + if (self.log.errors > 0) { + const stderr = std.io.getStdErr().writer(); + self.log.print(stderr) catch unreachable; } else { - std.debug.print(" <{s}> ", .{tokenToString.get(self.token)}); + if (self.token == T.t_identifier or self.token == T.t_string_literal) { + std.debug.print(" {s} ", .{self.raw()}); + } else { + std.debug.print(" <{s}> ", .{tokenToString.get(self.token)}); + } } } - } - pub fn expectContextualKeyword(self: *Lexer, comptime keyword: string) void { - if (!self.isContextualKeyword(keyword)) { - self.addError(self.start, "\"{s}\"", .{keyword}, true); + pub fn expectContextualKeyword(self: *@This(), comptime keyword: string) void { + if (!self.isContextualKeyword(keyword)) { + self.addError(self.start, "\"{s}\"", .{keyword}, true); + } + self.next(); } - self.next(); - } - - pub fn next(lexer: *Lexer) void { - lexer.has_newline_before = lexer.end == 0; - lex: while (lexer.log.errors == 0) { - lexer.start = lexer.end; - lexer.token = T.t_end_of_file; + pub fn next(lexer: *@This()) void { + lexer.has_newline_before = lexer.end == 0; - switch (lexer.code_point) { - -1 => { - lexer.token = T.t_end_of_file; - break :lex; - }, + lex: while (lexer.log.errors == 0) { + lexer.start = lexer.end; + lexer.token = T.t_end_of_file; - '#' => { - if (lexer.start == 0 and lexer.source.contents[1] == '!') { - lexer.addUnsupportedSyntaxError("#!hashbang is not supported yet."); - return; - } + switch (lexer.code_point) { + -1 => { + lexer.token = T.t_end_of_file; + break :lex; + }, - lexer.step(); - if (!isIdentifierStart(lexer.code_point)) { - lexer.syntaxError(); - } - lexer.step(); + '#' => { + if (lexer.start == 0 and lexer.source.contents[1] == '!') { + lexer.addUnsupportedSyntaxError("#!hashbang is not supported yet."); + return; + } - if (isIdentifierStart(lexer.code_point)) { lexer.step(); - while (isIdentifierContinue(lexer.code_point)) { - lexer.step(); - } - if (lexer.code_point == '\\') { - lexer.scanIdentifierWithEscapes(); - lexer.token = T.t_private_identifier; - // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier); - } else { - lexer.token = T.t_private_identifier; - lexer.identifier = lexer.raw(); + if (!isIdentifierStart(lexer.code_point)) { + lexer.syntaxError(); } - break; - } - }, - '\r', '\n', 0x2028, 0x2029 => { - lexer.step(); - lexer.has_newline_before = true; - continue; - }, - - '\t', ' ' => { - lexer.step(); - continue; - }, - - '(' => { - lexer.step(); - lexer.token = T.t_open_paren; - }, - ')' => { - lexer.step(); - lexer.token = T.t_close_paren; - }, - '[' => { - lexer.step(); - lexer.token = T.t_open_bracket; - }, - ']' => { - lexer.step(); - lexer.token = T.t_close_bracket; - }, - '{' => { - lexer.step(); - lexer.token = T.t_open_brace; - }, - '}' => { - lexer.step(); - lexer.token = T.t_close_brace; - }, - ',' => { - lexer.step(); - lexer.token = T.t_comma; - }, - ':' => { - lexer.step(); - lexer.token = T.t_colon; - }, - ';' => { - lexer.step(); - lexer.token = T.t_semicolon; - }, - '@' => { - lexer.step(); - lexer.token = T.t_at; - }, - '~' => { - lexer.step(); - lexer.token = T.t_tilde; - }, + lexer.step(); - '?' => { - // '?' or '?.' or '??' or '??=' - lexer.step(); - switch (lexer.code_point) { - '?' => { + if (isIdentifierStart(lexer.code_point)) { lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_question_question_equals; - }, - else => { - lexer.token = T.t_question_question; - }, + while (isIdentifierContinue(lexer.code_point)) { + lexer.step(); } - }, - - '.' => { - lexer.token = T.t_question; - const current = lexer.current; - const contents = lexer.source.contents; - - // Lookahead to disambiguate with 'a?.1:b' - if (current < contents.len) { - const c = contents[current]; - if (c < '0' or c > '9') { - lexer.step(); - lexer.token = T.t_question_dot; - } + if (lexer.code_point == '\\') { + lexer.scanIdentifierWithEscapes(); + lexer.token = T.t_private_identifier; + // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier); + } else { + lexer.token = T.t_private_identifier; + lexer.identifier = lexer.raw(); } - }, - else => { - lexer.token = T.t_question; - }, - } - }, - - '%' => { - // '%' or '%=' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_percent_equals; - }, + break; + } + }, + '\r', '\n', 0x2028, 0x2029 => { + lexer.step(); + lexer.has_newline_before = true; + continue; + }, + '\t', ' ' => { + lexer.step(); + continue; + }, + '(' => { + lexer.step(); + lexer.token = T.t_open_paren; + }, + ')' => { + lexer.step(); + lexer.token = T.t_close_paren; + }, + '[' => { + lexer.step(); + lexer.token = T.t_open_bracket; + }, + ']' => { + lexer.step(); + lexer.token = T.t_close_bracket; + }, + '{' => { + lexer.step(); + lexer.token = T.t_open_brace; + }, + '}' => { + lexer.step(); + lexer.token = T.t_close_brace; + }, + ',' => { + lexer.step(); + lexer.token = T.t_comma; + }, + ':' => { + lexer.step(); + lexer.token = T.t_colon; + }, + ';' => { + lexer.step(); + lexer.token = T.t_semicolon; + }, + '@' => { + lexer.step(); + lexer.token = T.t_at; + }, + '~' => { + lexer.step(); + lexer.token = T.t_tilde; + }, + '?' => { + // '?' or '?.' or '??' or '??=' + lexer.step(); + switch (lexer.code_point) { + '?' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_question_question_equals; + }, + else => { + lexer.token = T.t_question_question; + }, + } + }, - else => { - lexer.token = T.t_percent; - }, - } - }, + '.' => { + lexer.token = T.t_question; + const current = lexer.current; + const contents = lexer.source.contents; + + // Lookahead to disambiguate with 'a?.1:b' + if (current < contents.len) { + const c = contents[current]; + if (c < '0' or c > '9') { + lexer.step(); + lexer.token = T.t_question_dot; + } + } + }, + else => { + lexer.token = T.t_question; + }, + } + }, + '%' => { + // '%' or '%=' + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_percent_equals; + }, - '&' => { - // '&' or '&=' or '&&' or '&&=' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_ampersand_equals; - }, + else => { + lexer.token = T.t_percent; + }, + } + }, - '&' => { - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_ampersand_ampersand_equals; - }, + '&' => { + // '&' or '&=' or '&&' or '&&=' + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_ampersand_equals; + }, - else => { - lexer.token = T.t_ampersand_ampersand; - }, - } - }, - else => { - lexer.token = T.t_ampersand; - }, - } - }, + '&' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_ampersand_ampersand_equals; + }, + + else => { + lexer.token = T.t_ampersand_ampersand; + }, + } + }, + else => { + lexer.token = T.t_ampersand; + }, + } + }, - '|' => { + '|' => { - // '|' or '|=' or '||' or '||=' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_bar_equals; - }, - '|' => { - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_bar_bar_equals; - }, + // '|' or '|=' or '||' or '||=' + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_bar_equals; + }, + '|' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_bar_bar_equals; + }, + + else => { + lexer.token = T.t_bar_bar; + }, + } + }, + else => { + lexer.token = T.t_bar; + }, + } + }, - else => { - lexer.token = T.t_bar_bar; - }, - } - }, - else => { - lexer.token = T.t_bar; - }, - } - }, + '^' => { + // '^' or '^=' + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_caret_equals; + }, - '^' => { - // '^' or '^=' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_caret_equals; - }, + else => { + lexer.token = T.t_caret; + }, + } + }, - else => { - lexer.token = T.t_caret; - }, - } - }, + '+' => { + // '+' or '+=' or '++' + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_plus_equals; + }, - '+' => { - // '+' or '+=' or '++' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_plus_equals; - }, + '+' => { + lexer.step(); + lexer.token = T.t_plus_plus; + }, - '+' => { - lexer.step(); - lexer.token = T.t_plus_plus; - }, + else => { + lexer.token = T.t_plus; + }, + } + }, - else => { - lexer.token = T.t_plus; - }, - } - }, + '-' => { + // '+' or '+=' or '++' + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_minus_equals; + }, - '=' => { - // '=' or '=>' or '==' or '===' - lexer.step(); - switch (lexer.code_point) { - '>' => { - lexer.step(); - lexer.token = T.t_equals_greater_than; - }, + '-' => { + lexer.step(); - '=' => { - lexer.step(); - switch (lexer.code_point) { - '=' => { + if (lexer.code_point == '>' and lexer.has_newline_before) { lexer.step(); - lexer.token = T.t_equals_equals_equals; - }, - - else => { - lexer.token = T.t_equals_equals; - }, - } - }, + lexer.log.addRangeWarning(lexer.source, lexer.range(), "Treating \"-->\" as the start of a legacy HTML single-line comment") catch unreachable; + + singleLineHTMLCloseComment: while (true) { + switch (lexer.code_point) { + '\r', '\n', 0x2028, 0x2029 => { + break :singleLineHTMLCloseComment; + }, + -1 => { + break :singleLineHTMLCloseComment; + }, + else => {}, + } + lexer.step(); + } + continue; + } - else => { - lexer.token = T.t_equals; - }, - } - }, + lexer.token = T.t_minus_minus; + }, - '<' => { - // '<' or '<<' or '<=' or '<<=' or '<!--' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_less_than_equals; - }, + else => { + lexer.token = T.t_plus; + }, + } + }, - '<' => { - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_less_than_less_than_equals; - }, + '*' => { + // '*' or '*=' or '**' or '**=' - else => { - lexer.token = T.t_less_than_less_than; - }, - } - }, - // Handle legacy HTML-style comments - '!' => { - if (std.mem.eql(u8, lexer.peek("--".len), "--")) { - lexer.addUnsupportedSyntaxError("Legacy HTML comments not implemented yet!"); - return; - } - - lexer.token = T.t_less_than; - }, + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = .t_asterisk_equals; + }, + '*' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = .t_asterisk_asterisk_equals; + }, + else => { + lexer.token = .t_asterisk_asterisk; + }, + } + }, + else => { + lexer.token = .t_asterisk; + }, + } + }, + '/' => { + // '/' or '/=' or '//' or '/* ... */' + lexer.step(); + // TODO: forGlobalName - else => { - lexer.token = T.t_less_than; - }, - } - }, + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = .t_slash_equals; + }, + '/' => { + lexer.step(); + singleLineComment: while (true) { + lexer.step(); + switch (lexer.code_point) { + '\r', '\n', 0x2028, 0x2029 => { + break :singleLineComment; + }, + -1 => { + break :singleLineComment; + }, + else => {}, + } + } - '>' => { - // '>' or '>>' or '>>>' or '>=' or '>>=' or '>>>=' - lexer.step(); + if (jsonOptions) |json| { + if (!json.allow_comments) { + lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true); + return; + } + } + lexer.scanCommentText(); + continue; + }, + '*' => { + lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_greater_than_equals; - }, - '>' => { - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_greater_than_greater_than_equals; - }, - '>' => { - lexer.step(); + multiLineComment: while (true) { switch (lexer.code_point) { - '=' => { + '*' => { + lexer.step(); + if (lexer.code_point == '/') { + lexer.step(); + break :multiLineComment; + } + }, + '\r', '\n', 0x2028, 0x2029 => { lexer.step(); - lexer.token = T.t_greater_than_greater_than_greater_than_equals; + lexer.has_newline_before = true; + }, + -1 => { + lexer.start = lexer.end; + lexer.addError(lexer.start, "Expected \"*/\" to terminate multi-line comment", .{}, true); }, else => { - lexer.token = T.t_greater_than_greater_than_greater_than; + lexer.step(); }, } - }, - else => { - lexer.token = T.t_greater_than_greater_than; - }, - } - }, - else => { - lexer.token = T.t_greater_than; - }, - } - }, + } + if (jsonOptions) |json| { + if (!json.allow_comments) { + lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true); + return; + } + } + lexer.scanCommentText(); + continue; + }, + else => { + lexer.token = .t_slash; + }, + } + }, - '!' => { - // '!' or '!=' or '!==' - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - switch (lexer.code_point) { - '=' => { - lexer.step(); - lexer.token = T.t_exclamation_equals_equals; - }, + '=' => { + // '=' or '=>' or '==' or '===' + lexer.step(); + switch (lexer.code_point) { + '>' => { + lexer.step(); + lexer.token = T.t_equals_greater_than; + }, - else => { - lexer.token = T.t_exclamation_equals; - }, - } - }, - else => { - lexer.token = T.t_exclamation; - }, - } - }, + '=' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_equals_equals_equals; + }, + + else => { + lexer.token = T.t_equals_equals; + }, + } + }, - '\'', '"', '`' => { - lexer.parseStringLiteral(); - }, + else => { + lexer.token = T.t_equals; + }, + } + }, - '_', '$', 'a'...'z', 'A'...'Z' => { - lexer.step(); - while (isIdentifierContinue(lexer.code_point)) { + '<' => { + // '<' or '<<' or '<=' or '<<=' or '<!--' lexer.step(); - } + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_less_than_equals; + }, - if (lexer.code_point == '\\') { - lexer.scanIdentifierWithEscapes(); - } else { - const contents = lexer.raw(); - lexer.identifier = contents; - if (Keywords.get(contents)) |keyword| { - lexer.token = keyword; - } else { - lexer.token = T.t_identifier; + '<' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_less_than_less_than_equals; + }, + + else => { + lexer.token = T.t_less_than_less_than; + }, + } + }, + // Handle legacy HTML-style comments + '!' => { + if (std.mem.eql(u8, lexer.peek("--".len), "--")) { + lexer.addUnsupportedSyntaxError("Legacy HTML comments not implemented yet!"); + return; + } + + lexer.token = T.t_less_than; + }, + + else => { + lexer.token = T.t_less_than; + }, } - } - }, + }, - '\\' => { - // TODO: normal - lexer.scanIdentifierWithEscapes(); - }, + '>' => { + // '>' or '>>' or '>>>' or '>=' or '>>=' or '>>>=' + lexer.step(); - '.', '0'...'9' => { - lexer.parseNumericLiteralOrDot(); - }, + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_greater_than_equals; + }, + '>' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_greater_than_greater_than_equals; + }, + '>' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_greater_than_greater_than_greater_than_equals; + }, + else => { + lexer.token = T.t_greater_than_greater_than_greater_than; + }, + } + }, + else => { + lexer.token = T.t_greater_than_greater_than; + }, + } + }, + else => { + lexer.token = T.t_greater_than; + }, + } + }, - else => { - // Check for unusual whitespace characters - if (isWhitespace(lexer.code_point)) { + '!' => { + // '!' or '!=' or '!==' lexer.step(); - continue; - } + switch (lexer.code_point) { + '=' => { + lexer.step(); + switch (lexer.code_point) { + '=' => { + lexer.step(); + lexer.token = T.t_exclamation_equals_equals; + }, + + else => { + lexer.token = T.t_exclamation_equals; + }, + } + }, + else => { + lexer.token = T.t_exclamation; + }, + } + }, - if (isIdentifierStart(lexer.code_point)) { + '\'', '"', '`' => { + lexer.parseStringLiteral(); + }, + + '_', '$', 'a'...'z', 'A'...'Z' => { lexer.step(); while (isIdentifierContinue(lexer.code_point)) { lexer.step(); } - if (lexer.code_point == '\\') { - // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier); + if (lexer.code_point == '\\') { + lexer.scanIdentifierWithEscapes(); } else { - lexer.token = T.t_identifier; - lexer.identifier = lexer.raw(); + const contents = lexer.raw(); + lexer.identifier = contents; + if (Keywords.get(contents)) |keyword| { + lexer.token = keyword; + } else { + lexer.token = T.t_identifier; + } } - break; - } - - lexer.end = lexer.current; - lexer.token = T.t_syntax_error; - }, - } + }, - return; - } - } + '\\' => { + // TODO: normal + lexer.scanIdentifierWithEscapes(); + }, - pub fn expected(self: *Lexer, token: T) void { - if (tokenToString.get(token).len > 0) { - self.expectedString(tokenToString.get(token)); - } else { - self.unexpected(); - } - } + '.', '0'...'9' => { + lexer.parseNumericLiteralOrDot(); + }, - pub fn unexpected(lexer: *Lexer) void { - var found: string = undefined; - if (lexer.start == lexer.source.contents.len) { - found = "end of file"; - } else { - found = lexer.raw(); - } + else => { + // Check for unusual whitespace characters + if (isWhitespace(lexer.code_point)) { + lexer.step(); + continue; + } - lexer.addRangeError(lexer.range(), "Unexpected {s}", .{found}, true); - } + if (isIdentifierStart(lexer.code_point)) { + lexer.step(); + while (isIdentifierContinue(lexer.code_point)) { + lexer.step(); + } + if (lexer.code_point == '\\') { - pub fn raw(self: *Lexer) []const u8 { - return self.source.contents[self.start..self.end]; - } + // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier); + } else { + lexer.token = T.t_identifier; + lexer.identifier = lexer.raw(); + } + break; + } - pub fn isContextualKeyword(self: *Lexer, comptime keyword: string) bool { - return self.token == .t_identifier and strings.eql(self.raw(), keyword); - } + lexer.end = lexer.current; + lexer.token = T.t_syntax_error; + }, + } - pub fn expectedString(self: *Lexer, text: string) void { - var found = text; - if (self.source.contents.len == self.start) { - found = "end of file"; + return; + } } - self.addRangeError(self.range(), "Expected {s} but found {s}", .{ text, found }, true); - } - - pub fn range(self: *Lexer) logger.Range { - return logger.Range{ - .loc = logger.usize2Loc(self.start), - .len = std.math.lossyCast(i32, self.end - self.start), - }; - } - pub fn init(log: logger.Log, source: logger.Source, allocator: *std.mem.Allocator) !Lexer { - var empty_string_literal: JavascriptString = undefined; - var lex = Lexer{ - .log = log, - .source = source, - .string_literal = empty_string_literal, - .prev_error_loc = logger.Loc.Empty, - .allocator = allocator, - }; - lex.step(); - lex.next(); - - return lex; - } + pub fn expected(self: *@This(), token: T) void { + if (tokenToString.get(token).len > 0) { + self.expectedString(tokenToString.get(token)); + } else { + self.unexpected(); + } + } - pub fn scanRegExp(lexer: *Lexer) void { - while (true) { - switch (lexer.code_point) { - '/' => { - lexer.step(); - while (isIdentifierContinue(lexer.code_point)) { - switch (lexer.code_point) { - 'g', 'i', 'm', 's', 'u', 'y' => { - lexer.step(); - }, - else => { - lexer.syntaxError(); - }, - } - } - }, - '[' => { - lexer.step(); - while (lexer.code_point != ']') { - lexer.scanRegExpValidateAndStep(); - } - lexer.step(); - }, - else => { - lexer.scanRegExpValidateAndStep(); - }, + pub fn unexpected(lexer: *@This()) void { + var found: string = undefined; + if (lexer.start == lexer.source.contents.len) { + found = "end of file"; + } else { + found = lexer.raw(); } + + lexer.addRangeError(lexer.range(), "Unexpected {s}", .{found}, true); } - } - // TODO: use wtf-8 encoding. - pub fn stringToUTF16(lexer: *Lexer, str: string) JavascriptString { - var buf: JavascriptString = lexer.allocator.alloc(u16, std.mem.len(str)) catch unreachable; - var i: usize = 0; - // theres prob a faster/better way - for (str) |char| { - buf[i] = char; - i += 1; + pub fn raw(self: *@This()) []const u8 { + return self.source.contents[self.start..self.end]; } - return buf; - } + pub fn isContextualKeyword(self: *@This(), comptime keyword: string) bool { + return self.token == .t_identifier and strings.eql(self.raw(), keyword); + } - // TODO: use wtf-8 encoding. - pub fn utf16ToStringWithValidation(lexer: *Lexer, js: JavascriptString) !string { - return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js); - } + pub fn expectedString(self: *@This(), text: string) void { + var found = text; + if (self.source.contents.len == self.start) { + found = "end of file"; + } + self.addRangeError(self.range(), "Expected {s} but found {s}", .{ text, found }, true); + } - // TODO: use wtf-8 encoding. - pub fn utf16ToString(lexer: *Lexer, js: JavascriptString) string { - return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js) catch unreachable; - } + pub fn scanCommentText(lexer: *@This()) void { + var text = lexer.source.contents[lexer.start..lexer.end]; + const has_preserve_annotation = text.len > 2 and text[2] == '!'; + const is_multiline_comment = text[1] == '*'; - pub fn nextInsideJSXElement() void { - std.debug.panic("JSX not implemented yet.", .{}); - } + // Omit the trailing "*/" from the checks below + var endCommentText = text.len; + if (is_multiline_comment) { + endCommentText -= 2; + } - fn scanRegExpValidateAndStep(lexer: *Lexer) void { - if (lexer.code_point == '\\') { - lexer.step(); - } + if (has_preserve_annotation or lexer.preserve_all_comments_before) { + if (is_multiline_comment) { + // text = lexer.removeMultilineCommentIndent(lexer.source.contents[0..lexer.start], text); + } - switch (lexer.code_point) { - '\r', '\n', 0x2028, 0x2029 => { - // Newlines aren't allowed in regular expressions - lexer.syntaxError(); - }, - -1 => { // EOF - lexer.syntaxError(); - }, - else => { - lexer.step(); - }, + lexer.comments_to_preserve_before.append(js_ast.G.Comment{ + .text = text, + .loc = lexer.loc(), + }) catch unreachable; + } } - } - pub fn rescanCloseBraceAsTemplateToken(lexer: *Lexer) void { - if (lexer.token != .t_close_brace) { - lexer.expected(.t_close_brace); + // TODO: implement this + // it's too complicated to handle all the edgecases right now given the state of Zig's standard library + pub fn removeMultilineCommentIndent(lexer: *@This(), _prefix: string, text: string) string { + return text; } - lexer.rescan_close_brace_as_template_token = true; - lexer.code_point = '`'; - lexer.current = lexer.end; - lexer.end -= 1; - lexer.next(); - lexer.rescan_close_brace_as_template_token = false; - } - - pub fn rawTemplateContents(lexer: *Lexer) string { - var text: string = undefined; - - switch (lexer.token) { - .t_no_substitution_template_literal, .t_template_tail => { - text = lexer.source.contents[lexer.start + 1 .. lexer.end - 1]; - }, - .t_template_middle, .t_template_head => { - text = lexer.source.contents[lexer.start + 1 .. lexer.end - 2]; - }, - else => {}, + pub fn range(self: *@This()) logger.Range { + return logger.Range{ + .loc = logger.usize2Loc(self.start), + .len = std.math.lossyCast(i32, self.end - self.start), + }; } - if (strings.indexOfChar(text, '\r') == null) { - return text; + pub fn init(log: logger.Log, source: logger.Source, allocator: *std.mem.Allocator) !@This() { + var empty_string_literal: JavascriptString = undefined; + var lex = @This(){ + .log = log, + .source = source, + .string_literal = empty_string_literal, + .prev_error_loc = logger.Loc.Empty, + .allocator = allocator, + .comments_to_preserve_before = std.ArrayList(js_ast.G.Comment).init(allocator), + }; + lex.step(); + lex.next(); + + return lex; } - // From the specification: - // - // 11.8.6.1 Static Semantics: TV and TRV - // - // TV excludes the code units of LineContinuation while TRV includes - // them. <CR><LF> and <CR> LineTerminatorSequences are normalized to - // <LF> for both TV and TRV. An explicit EscapeSequence is needed to - // include a <CR> or <CR><LF> sequence. - var bytes = MutableString.initCopy(lexer.allocator, text) catch unreachable; - var end: usize = 0; - var i: usize = 0; - var c: u8 = '0'; - while (i < bytes.list.items.len) { - c = bytes.list.items[i]; - i += 1; - - if (c == '\r') { - // Convert '\r\n' into '\n' - if (i < bytes.list.items.len and bytes.list.items[i] == '\n') { - i += 1; + pub fn scanRegExp(lexer: *@This()) void { + while (true) { + switch (lexer.code_point) { + '/' => { + lexer.step(); + while (isIdentifierContinue(lexer.code_point)) { + switch (lexer.code_point) { + 'g', 'i', 'm', 's', 'u', 'y' => { + lexer.step(); + }, + else => { + lexer.syntaxError(); + }, + } + } + }, + '[' => { + lexer.step(); + while (lexer.code_point != ']') { + lexer.scanRegExpValidateAndStep(); + } + lexer.step(); + }, + else => { + lexer.scanRegExpValidateAndStep(); + }, } + } + } - // Convert '\r' into '\n' - c = '\n'; + // TODO: use wtf-8 encoding. + pub fn stringToUTF16(lexer: *@This(), str: string) JavascriptString { + var buf: JavascriptString = lexer.allocator.alloc(u16, std.mem.len(str)) catch unreachable; + var i: usize = 0; + // theres prob a faster/better way + for (str) |char| { + buf[i] = char; + i += 1; } - bytes.list.items[end] = c; - end += 1; + return buf; } - return bytes.toOwnedSliceLength(end + 1); - } - - fn parseNumericLiteralOrDot(lexer: *Lexer) void { - // Number or dot; - var first = lexer.code_point; - lexer.step(); - - // Dot without a digit after it; - if (first == '.' and (lexer.code_point < '0' or lexer.code_point > '9')) { - // "..." - if ((lexer.code_point == '.' and - lexer.current < lexer.source.contents.len) and - lexer.source.contents[lexer.current] == '.') - { - lexer.step(); - lexer.step(); - lexer.token = T.t_dot_dot_dot; - return; - } + // TODO: use wtf-8 encoding. + pub fn utf16ToStringWithValidation(lexer: *@This(), js: JavascriptString) !string { + return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js); + } - // "." - lexer.token = T.t_dot; - return; + // TODO: use wtf-8 encoding. + pub fn utf16ToString(lexer: *@This(), js: JavascriptString) string { + return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js) catch unreachable; } - var underscoreCount: usize = 0; - var lastUnderscoreEnd: usize = 0; - var hasDotOrExponent = first == '.'; - var base: f32 = 0.0; - lexer.is_legacy_octal_literal = false; + pub fn nextInsideJSXElement() void { + std.debug.panic("JSX not implemented yet.", .{}); + } - // Assume this is a number, but potentially change to a bigint later; - lexer.token = T.t_numeric_literal; + fn scanRegExpValidateAndStep(lexer: *@This()) void { + if (lexer.code_point == '\\') { + lexer.step(); + } - // Check for binary, octal, or hexadecimal literal; - if (first == '0') { switch (lexer.code_point) { - 'b', 'B' => { - base = 2; + '\r', '\n', 0x2028, 0x2029 => { + // Newlines aren't allowed in regular expressions + lexer.syntaxError(); }, - - 'o', 'O' => { - base = 8; + -1 => { // EOF + lexer.syntaxError(); }, - - 'x', 'X' => { - base = 16; + else => { + lexer.step(); }, + } + } + + pub fn rescanCloseBraceAsTemplateToken(lexer: *@This()) void { + if (lexer.token != .t_close_brace) { + lexer.expected(.t_close_brace); + } - '0'...'7', '_' => { - base = 8; - lexer.is_legacy_octal_literal = true; + lexer.rescan_close_brace_as_template_token = true; + lexer.code_point = '`'; + lexer.current = lexer.end; + lexer.end -= 1; + lexer.next(); + lexer.rescan_close_brace_as_template_token = false; + } + + pub fn rawTemplateContents(lexer: *@This()) string { + var text: string = undefined; + + switch (lexer.token) { + .t_no_substitution_template_literal, .t_template_tail => { + text = lexer.source.contents[lexer.start + 1 .. lexer.end - 1]; + }, + .t_template_middle, .t_template_head => { + text = lexer.source.contents[lexer.start + 1 .. lexer.end - 2]; }, else => {}, } + + if (strings.indexOfChar(text, '\r') == null) { + return text; + } + + // From the specification: + // + // 11.8.6.1 Static Semantics: TV and TRV + // + // TV excludes the code units of LineContinuation while TRV includes + // them. <CR><LF> and <CR> LineTerminatorSequences are normalized to + // <LF> for both TV and TRV. An explicit EscapeSequence is needed to + // include a <CR> or <CR><LF> sequence. + var bytes = MutableString.initCopy(lexer.allocator, text) catch unreachable; + var end: usize = 0; + var i: usize = 0; + var c: u8 = '0'; + while (i < bytes.list.items.len) { + c = bytes.list.items[i]; + i += 1; + + if (c == '\r') { + // Convert '\r\n' into '\n' + if (i < bytes.list.items.len and bytes.list.items[i] == '\n') { + i += 1; + } + + // Convert '\r' into '\n' + c = '\n'; + } + + bytes.list.items[end] = c; + end += 1; + } + + return bytes.toOwnedSliceLength(end + 1); } - if (base != 0) { - // Integer literal; - var isFirst = true; - var isInvalidLegacyOctalLiteral = false; - lexer.number = 0; - if (!lexer.is_legacy_octal_literal) { - lexer.step(); + fn parseNumericLiteralOrDot(lexer: *@This()) void { + // Number or dot; + var first = lexer.code_point; + lexer.step(); + + // Dot without a digit after it; + if (first == '.' and (lexer.code_point < '0' or lexer.code_point > '9')) { + // "..." + if ((lexer.code_point == '.' and + lexer.current < lexer.source.contents.len) and + lexer.source.contents[lexer.current] == '.') + { + lexer.step(); + lexer.step(); + lexer.token = T.t_dot_dot_dot; + return; + } + + // "." + lexer.token = T.t_dot; + return; } - integerLiteral: while (true) { - switch (lexer.code_point) { - '_' => { - // Cannot have multiple underscores in a row; - if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { - lexer.syntaxError(); - } + var underscoreCount: usize = 0; + var lastUnderscoreEnd: usize = 0; + var hasDotOrExponent = first == '.'; + var base: f32 = 0.0; + lexer.is_legacy_octal_literal = false; - // The first digit must exist; - if (isFirst or lexer.is_legacy_octal_literal) { - lexer.syntaxError(); - } + // Assume this is a number, but potentially change to a bigint later; + lexer.token = T.t_numeric_literal; - lastUnderscoreEnd = lexer.end; - underscoreCount += 1; + // Check for binary, octal, or hexadecimal literal; + if (first == '0') { + switch (lexer.code_point) { + 'b', 'B' => { + base = 2; }, - '0', '1' => { - lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + 'o', 'O' => { + base = 8; }, - '2', '3', '4', '5', '6', '7' => { - if (base == 2) { - lexer.syntaxError(); - } - lexer.number = lexer.number * base + float64(lexer.code_point - '0'); - }, - '8', '9' => { - if (lexer.is_legacy_octal_literal) { - isInvalidLegacyOctalLiteral = true; - } else if (base < 10) { - lexer.syntaxError(); - } - lexer.number = lexer.number * base + float64(lexer.code_point - '0'); - }, - 'A', 'B', 'C', 'D', 'E', 'F' => { - if (base != 16) { - lexer.syntaxError(); - } - lexer.number = lexer.number * base + float64(lexer.code_point + 10 - 'A'); + 'x', 'X' => { + base = 16; }, - 'a', 'b', 'c', 'd', 'e', 'f' => { - if (base != 16) { - lexer.syntaxError(); - } - lexer.number = lexer.number * base + float64(lexer.code_point + 10 - 'a'); + '0'...'7', '_' => { + base = 8; + lexer.is_legacy_octal_literal = true; }, - else => { - // The first digit must exist; - if (isFirst) { - lexer.syntaxError(); - } + else => {}, + } + } - break :integerLiteral; - }, + if (base != 0) { + // Integer literal; + var isFirst = true; + var isInvalidLegacyOctalLiteral = false; + lexer.number = 0; + if (!lexer.is_legacy_octal_literal) { + lexer.step(); } - lexer.step(); - isFirst = false; - } + integerLiteral: while (true) { + switch (lexer.code_point) { + '_' => { + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } - var isBigIntegerLiteral = lexer.code_point == 'n' and !hasDotOrExponent; + // The first digit must exist; + if (isFirst or lexer.is_legacy_octal_literal) { + lexer.syntaxError(); + } - // Slow path: do we need to re-scan the input as text? - if (isBigIntegerLiteral or isInvalidLegacyOctalLiteral) { - var text = lexer.raw(); + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; + }, - // Can't use a leading zero for bigint literals; - if (isBigIntegerLiteral and lexer.is_legacy_octal_literal) { - lexer.syntaxError(); - } + '0', '1' => { + lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + }, - // Filter out underscores; - if (underscoreCount > 0) { - var bytes = lexer.allocator.alloc(u8, text.len - underscoreCount) catch unreachable; - var i: usize = 0; - for (text) |char| { - if (char != '_') { - bytes[i] = char; - i += 1; - } - } - } + '2', '3', '4', '5', '6', '7' => { + if (base == 2) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + }, + '8', '9' => { + if (lexer.is_legacy_octal_literal) { + isInvalidLegacyOctalLiteral = true; + } else if (base < 10) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + }, + 'A', 'B', 'C', 'D', 'E', 'F' => { + if (base != 16) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point + 10 - 'A'); + }, - // Store bigints as text to avoid precision loss; - if (isBigIntegerLiteral) { - lexer.identifier = text; - } else if (isInvalidLegacyOctalLiteral) { - if (std.fmt.parseFloat(f64, text)) |num| { - lexer.number = num; - } else |err| { - lexer.addError(lexer.start, "Invalid number {s}", .{text}, true); + 'a', 'b', 'c', 'd', 'e', 'f' => { + if (base != 16) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point + 10 - 'a'); + }, + else => { + // The first digit must exist; + if (isFirst) { + lexer.syntaxError(); + } + + break :integerLiteral; + }, } + + lexer.step(); + isFirst = false; } - } - } else { - // Floating-point literal; - var isInvalidLegacyOctalLiteral = first == '0' and (lexer.code_point == '8' or lexer.code_point == '9'); - // Initial digits; - while (true) { - if (lexer.code_point < '0' or lexer.code_point > '9') { - if (lexer.code_point != '_') { - break; - } + var isBigIntegerLiteral = lexer.code_point == 'n' and !hasDotOrExponent; - // Cannot have multiple underscores in a row; - if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { - lexer.syntaxError(); - } + // Slow path: do we need to re-scan the input as text? + if (isBigIntegerLiteral or isInvalidLegacyOctalLiteral) { + var text = lexer.raw(); - // The specification forbids underscores in this case; - if (isInvalidLegacyOctalLiteral) { + // Can't use a leading zero for bigint literals; + if (isBigIntegerLiteral and lexer.is_legacy_octal_literal) { lexer.syntaxError(); } - lastUnderscoreEnd = lexer.end; - underscoreCount += 1; - } - lexer.step(); - } + // Filter out underscores; + if (underscoreCount > 0) { + var bytes = lexer.allocator.alloc(u8, text.len - underscoreCount) catch unreachable; + var i: usize = 0; + for (text) |char| { + if (char != '_') { + bytes[i] = char; + i += 1; + } + } + } - // Fractional digits; - if (first != '.' and lexer.code_point == '.') { - // An underscore must not come last; - if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { - lexer.end -= 1; - lexer.syntaxError(); + // Store bigints as text to avoid precision loss; + if (isBigIntegerLiteral) { + lexer.identifier = text; + } else if (isInvalidLegacyOctalLiteral) { + if (std.fmt.parseFloat(f64, text)) |num| { + lexer.number = num; + } else |err| { + lexer.addError(lexer.start, "Invalid number {s}", .{text}, true); + } + } } + } else { + // Floating-point literal; + var isInvalidLegacyOctalLiteral = first == '0' and (lexer.code_point == '8' or lexer.code_point == '9'); - hasDotOrExponent = true; - lexer.step(); - if (lexer.code_point == '_') { - lexer.syntaxError(); - } + // Initial digits; while (true) { if (lexer.code_point < '0' or lexer.code_point > '9') { if (lexer.code_point != '_') { @@ -1150,110 +1291,151 @@ pub const Lexer = struct { lexer.syntaxError(); } + // The specification forbids underscores in this case; + if (isInvalidLegacyOctalLiteral) { + lexer.syntaxError(); + } + lastUnderscoreEnd = lexer.end; underscoreCount += 1; } lexer.step(); } - } - // Exponent; - if (lexer.code_point == 'e' or lexer.code_point == 'E') { - // An underscore must not come last; - if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { - lexer.end -= 1; - lexer.syntaxError(); - } + // Fractional digits; + if (first != '.' and lexer.code_point == '.') { + // An underscore must not come last; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.end -= 1; + lexer.syntaxError(); + } - hasDotOrExponent = true; - lexer.step(); - if (lexer.code_point == '+' or lexer.code_point == '-') { + hasDotOrExponent = true; lexer.step(); - } - if (lexer.code_point < '0' or lexer.code_point > '9') { - lexer.syntaxError(); - } - while (true) { - if (lexer.code_point < '0' or lexer.code_point > '9') { - if (lexer.code_point != '_') { - break; - } + if (lexer.code_point == '_') { + lexer.syntaxError(); + } + while (true) { + if (lexer.code_point < '0' or lexer.code_point > '9') { + if (lexer.code_point != '_') { + break; + } - // Cannot have multiple underscores in a row; - if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { - lexer.syntaxError(); + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } + + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; } + lexer.step(); + } + } - lastUnderscoreEnd = lexer.end; - underscoreCount += 1; + // Exponent; + if (lexer.code_point == 'e' or lexer.code_point == 'E') { + // An underscore must not come last; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.end -= 1; + lexer.syntaxError(); } + + hasDotOrExponent = true; lexer.step(); - } - } + if (lexer.code_point == '+' or lexer.code_point == '-') { + lexer.step(); + } + if (lexer.code_point < '0' or lexer.code_point > '9') { + lexer.syntaxError(); + } + while (true) { + if (lexer.code_point < '0' or lexer.code_point > '9') { + if (lexer.code_point != '_') { + break; + } - // Take a slice of the text to parse; - var text = lexer.raw(); + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } - // Filter out underscores; - if (underscoreCount > 0) { - var i: usize = 0; - if (lexer.allocator.alloc(u8, text.len - underscoreCount)) |bytes| { - for (text) |char| { - if (char != '_') { - bytes[i] = char; - i += 1; + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; } + lexer.step(); } - text = bytes; - } else |err| { - lexer.addError(lexer.start, "Out of Memory Wah Wah Wah", .{}, true); - return; } - } - if (lexer.code_point == 'n' and !hasDotOrExponent) { - // The only bigint literal that can start with 0 is "0n" - if (text.len > 1 and first == '0') { - lexer.syntaxError(); - } + // Take a slice of the text to parse; + var text = lexer.raw(); - // Store bigints as text to avoid precision loss; - lexer.identifier = text; - } else if (!hasDotOrExponent and lexer.end - lexer.start < 10) { - // Parse a 32-bit integer (very fast path); - var number: u32 = 0; - for (text) |c| { - number = number * 10 + @intCast(u32, c - '0'); + // Filter out underscores; + if (underscoreCount > 0) { + var i: usize = 0; + if (lexer.allocator.alloc(u8, text.len - underscoreCount)) |bytes| { + for (text) |char| { + if (char != '_') { + bytes[i] = char; + i += 1; + } + } + text = bytes; + } else |err| { + lexer.addError(lexer.start, "Out of Memory Wah Wah Wah", .{}, true); + return; + } } - lexer.number = @intToFloat(f64, number); - } else { - // Parse a double-precision floating-point number; - if (std.fmt.parseFloat(f64, text)) |num| { - lexer.number = num; - } else |err| { - lexer.addError(lexer.start, "Invalid number", .{}, true); + + if (lexer.code_point == 'n' and !hasDotOrExponent) { + // The only bigint literal that can start with 0 is "0n" + if (text.len > 1 and first == '0') { + lexer.syntaxError(); + } + + // Store bigints as text to avoid precision loss; + lexer.identifier = text; + } else if (!hasDotOrExponent and lexer.end - lexer.start < 10) { + // Parse a 32-bit integer (very fast path); + var number: u32 = 0; + for (text) |c| { + number = number * 10 + @intCast(u32, c - '0'); + } + lexer.number = @intToFloat(f64, number); + } else { + // Parse a double-precision floating-point number; + if (std.fmt.parseFloat(f64, text)) |num| { + lexer.number = num; + } else |err| { + lexer.addError(lexer.start, "Invalid number", .{}, true); + } } } - } - // An underscore must not come last; - if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { - lexer.end -= 1; - lexer.syntaxError(); - } + // An underscore must not come last; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.end -= 1; + lexer.syntaxError(); + } - // Handle bigint literals after the underscore-at-end check above; - if (lexer.code_point == 'n' and !hasDotOrExponent) { - lexer.token = T.t_big_integer_literal; - lexer.step(); - } + // Handle bigint literals after the underscore-at-end check above; + if (lexer.code_point == 'n' and !hasDotOrExponent) { + lexer.token = T.t_big_integer_literal; + lexer.step(); + } - // Identifiers can't occur immediately after numbers; - if (isIdentifierStart(lexer.code_point)) { - lexer.syntaxError(); + // Identifiers can't occur immediately after numbers; + if (isIdentifierStart(lexer.code_point)) { + lexer.syntaxError(); + } } - } -}; + }; +} + +// JS/TS lexer +pub const Lexer = NewLexerType(null); +pub const JSONLexer = NewLexerType(JSONOptions{ .allow_comments = false, .allow_trailing_commas = false }); +pub const TSConfigJSONLexer = NewLexerType(JSONOptions{ .allow_comments = true, .allow_trailing_commas = true }); fn isIdentifierStart(codepoint: CodePoint) bool { switch (codepoint) { @@ -1309,10 +1491,10 @@ fn isWhitespace(codepoint: CodePoint) bool { 0x202F, // narrow no-break space 0x205F, // medium mathematical space 0x3000, // ideographic space - 0xFEFF, + 0xFEFF, // zero width non-breaking space => { return true; - }, // zero width non-breaking space + }, else => { return false; }, @@ -1346,7 +1528,7 @@ fn float64(num: anytype) callconv(.Inline) f64 { return @intToFloat(f64, num); } -fn test_lexer(contents: []const u8) Lexer { +fn test_lexer(contents: []const u8) @This() { alloc.setup(std.heap.page_allocator) catch unreachable; const msgs = std.ArrayList(logger.Msg).init(alloc.dynamic); const log = logger.Log{ @@ -1357,10 +1539,10 @@ fn test_lexer(contents: []const u8) Lexer { "index.js", contents, ); - return Lexer.init(log, source, alloc.dynamic) catch unreachable; + return @This().init(log, source, alloc.dynamic) catch unreachable; } -// test "Lexer.next()" { +// test "@This().next()" { // try alloc.setup(std.heap.page_allocator); // const msgs = std.ArrayList(logger.Msg).init(alloc.dynamic); // const log = logger.Log{ @@ -1368,7 +1550,7 @@ fn test_lexer(contents: []const u8) Lexer { // }; // const source = logger.Source.initPathString("index.js", "for (let i = 0; i < 100; i++) { console.log('hi'); }", std.heap.page_allocator); -// var lex = try Lexer.init(log, source, alloc.dynamic); +// var lex = try @This().init(log, source, alloc.dynamic); // lex.next(); // } @@ -1411,7 +1593,7 @@ pub fn test_stringLiteralEquals(expected: string, source_text: string) void { source_text, ); - var lex = try Lexer.init(log, source, std.heap.page_allocator); + var lex = try @This().init(log, source, std.heap.page_allocator); while (!lex.token.isString() and lex.token != .t_end_of_file) { lex.next(); } @@ -1420,7 +1602,7 @@ pub fn test_stringLiteralEquals(expected: string, source_text: string) void { std.testing.expectEqualStrings(expected, lit); } -pub fn test_skipTo(lexer: *Lexer, n: string) void { +pub fn test_skipTo(lexer: *@This(), n: string) void { var i: usize = 0; while (i < n.len) { lexer.next(); @@ -1428,7 +1610,7 @@ pub fn test_skipTo(lexer: *Lexer, n: string) void { } } -test "Lexer.rawTemplateContents" { +test "@This().rawTemplateContents" { test_stringLiteralEquals("hello!", "const a = 'hello!';"); test_stringLiteralEquals("hello!hi", "const b = 'hello!hi';"); test_stringLiteralEquals("hello!\n\nhi", "const b = `hello!\n\nhi`;"); |