const std = @import("std");
const logger = @import("logger.zig");
const tables = @import("js_lexer_tables.zig");
const alloc = @import("alloc.zig");
const build_options = @import("build_options");
const js_ast = @import("js_ast.zig");

usingnamespace @import("ast/base.zig");
usingnamespace @import("global.zig");

const unicode = std.unicode;

const Source = logger.Source;
pub const T = tables.T;
pub const CodePoint = tables.CodePoint;
pub const Keywords = tables.Keywords;
pub const tokenToString = tables.tokenToString;
pub const StrictModeReservedWords = tables.StrictModeReservedWords;
pub const PropertyModifierKeyword = tables.PropertyModifierKeyword;
pub const TypescriptStmtKeyword = tables.TypescriptStmtKeyword;
pub const TypeScriptAccessibilityModifier = tables.TypeScriptAccessibilityModifier;

fn notimpl() noreturn {
    Global.panic("not implemented yet!", .{});
}

pub var emptyJavaScriptString = ([_]u16{0});

pub const JSONOptions = struct {
    allow_comments: bool = false,
    allow_trailing_commas: bool = false,
};

pub const Lexer = struct {
    const LexerType = @This();

    pub const Error = error{
        UTF8Fail,
        OutOfMemory,
        SyntaxError,
        UnexpectedSyntax,
        JSONStringsMustUseDoubleQuotes,
        ParserError,
    };

    // pub const Error = error{
    //     UnexpectedToken,
    //     EndOfFile,
    // };

    // err: ?LexerType.Error,
    log: *logger.Log,
    json_options: ?JSONOptions = null,
    for_global_name: bool = false,
    source: *const logger.Source,
    current: usize = 0,
    start: usize = 0,
    end: usize = 0,
    did_panic: bool = false,
    approximate_newline_count: i32 = 0,
    legacy_octal_loc: logger.Loc = logger.Loc.Empty,
    previous_backslash_quote_in_jsx: logger.Range = logger.Range.None,
    token: T = T.t_end_of_file,
    has_newline_before: bool = false,
    has_pure_comment_before: bool = false,
    preserve_all_comments_before: bool = false,
    is_legacy_octal_literal: bool = false,
    comments_to_preserve_before: std.ArrayList(js_ast.G.Comment),
    all_original_comments: ?[]js_ast.G.Comment = null,
    code_point: CodePoint = -1,
    identifier: []const u8 = "",
    jsx_factory_pragma_comment: ?js_ast.Span = null,
    jsx_fragment_pragma_comment: ?js_ast.Span = null,
    source_mapping_url: ?js_ast.Span = null,
    number: f64 = 0.0,
    rescan_close_brace_as_template_token: bool = false,
    prev_error_loc: logger.Loc = logger.Loc.Empty,
    allocator: *std.mem.Allocator,

    /// In JavaScript, strings are stored as UTF-16, but nearly every string is ASCII.
    /// This means, usually, we can skip UTF-8 -> UTF-16 conversions.
    string_literal_buffer: std.ArrayList(u16),
    string_literal_slice: string = "",
    string_literal: JavascriptString,
    string_literal_is_ascii: bool = false,

    pub fn loc(self: *LexerType) logger.Loc {
        return logger.usize2Loc(self.start);
    }

    fn nextCodepointSlice(it: *LexerType) callconv(.Inline) !?[]const u8 {
        if (it.current >= it.source.contents.len) {
            // Without this line, strings cut off one character before the end.
            it.end = it.current;
            return null;
        }

        const cp_len = unicode.utf8ByteSequenceLength(it.source.contents[it.current]) catch return Error.UTF8Fail;
        it.end = it.current;
        it.current += cp_len;

        return it.source.contents[it.current - cp_len .. it.current];
    }
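    // Illustrative note (not part of the original source): after each call to
    // nextCodepointSlice, `end` points at the first byte of the codepoint just
    // returned and `current` points one past it. For the source "é!", the first
    // call returns the two-byte slice for 'é' with end == 0 and current == 2,
    // and the next call returns "!" with end == 2 and current == 3.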
    pub fn syntaxError(self: *LexerType) !void {
        self.addError(self.start, "Syntax Error!!", .{}, true);
        return Error.SyntaxError;
    }

    pub fn addDefaultError(self: *LexerType, msg: []const u8) !void {
        self.addError(self.start, "{s}", .{msg}, true);
        return Error.SyntaxError;
    }

    pub fn addError(self: *LexerType, _loc: usize, comptime format: []const u8, args: anytype, panic: bool) void {
        var __loc = logger.usize2Loc(_loc);
        if (__loc.eql(self.prev_error_loc)) {
            return;
        }

        self.log.addErrorFmt(self.source, __loc, self.allocator, format, args) catch unreachable;
        self.prev_error_loc = __loc;
        var msg = self.log.msgs.items[self.log.msgs.items.len - 1];
        msg.formatNoWriter(Global.panic);
    }

    pub fn addRangeError(self: *LexerType, r: logger.Range, comptime format: []const u8, args: anytype, panic: bool) !void {
        if (self.prev_error_loc.eql(r.loc)) {
            return;
        }

        const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable;
        var msg = self.log.addRangeError(self.source, r, errorMessage);
        self.prev_error_loc = r.loc;

        if (panic) {
            return Error.ParserError;
        }
    }

    fn doPanic(self: *LexerType, content: []const u8) void {
        if (@import("builtin").is_test) {
            self.did_panic = true;
        } else {
            Global.panic("{s}", .{content});
        }
    }

    pub fn codePointEql(self: *LexerType, a: u8) bool {
        return @intCast(CodePoint, a) == self.code_point;
    }

    fn nextCodepoint(it: *LexerType) callconv(.Inline) !CodePoint {
        const slice = (try it.nextCodepointSlice()) orelse return @as(CodePoint, -1);

        switch (slice.len) {
            1 => return @as(CodePoint, slice[0]),
            2 => return @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable),
            3 => return @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable),
            4 => return @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable),
            else => unreachable,
        }
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
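    /// Example (illustrative): with the cursor at the start of "=> b",
    /// `try lexer.peek(2)` returns "=>" and leaves `lexer.current` unchanged,
    /// thanks to the deferred restore below.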
    fn peek(it: *LexerType, n: usize) !string {
        const original_i = it.current;
        defer it.current = original_i;

        var end_ix = original_i;
        var found: usize = 0;
        while (found < n) : (found += 1) {
            const next_codepoint = (try it.nextCodepointSlice()) orelse return it.source.contents[original_i..];
            end_ix += next_codepoint.len;
        }

        return it.source.contents[original_i..end_ix];
    }

    pub fn isIdentifierOrKeyword(lexer: LexerType) bool {
        return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier);
    }

    pub fn stringLiteralUTF16(lexer: *LexerType) JavascriptString {
        if (lexer.string_literal_is_ascii) {
            return lexer.stringToUTF16(lexer.string_literal_slice);
        } else {
            return lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable;
        }
    }

    fn parseStringLiteral(lexer: *LexerType) !void {
        var quote: CodePoint = lexer.code_point;
        var needs_slow_path = false;
        var suffixLen: usize = 1;

        if (quote != '`') {
            lexer.token = T.t_string_literal;
        } else if (lexer.rescan_close_brace_as_template_token) {
            lexer.token = T.t_template_tail;
        } else {
            lexer.token = T.t_no_substitution_template_literal;
        }
        try lexer.step();

        stringLiteral: while (true) {
            switch (lexer.code_point) {
                '\\' => {
                    needs_slow_path = true;
                    try lexer.step();

                    // Handle Windows CRLF. This is a JavaScript line continuation,
                    // which JSON does not allow, so it only applies outside JSON mode.
                    if (lexer.code_point == '\r' and lexer.json_options == null) {
                        try lexer.step();
                        if (lexer.code_point == '\n') {
                            try lexer.step();
                        }
                        continue :stringLiteral;
                    }
                },
                // This indicates the end of the file
                -1 => {
                    try lexer.addDefaultError("Unterminated string literal");
                },
                '\r' => {
                    if (quote != '`') {
                        try lexer.addDefaultError("Unterminated string literal");
                    }

                    // Template literals require newline normalization
                    needs_slow_path = true;
                },
                '\n' => {
                    if (quote != '`') {
                        try lexer.addDefaultError("Unterminated string literal");
                    }
                },
                '$' => {
                    if (quote == '`') {
                        try lexer.step();
                        if (lexer.code_point == '{') {
                            suffixLen = 2;
                            try lexer.step();
                            if (lexer.rescan_close_brace_as_template_token) {
                                lexer.token = T.t_template_middle;
                            } else {
                                lexer.token = T.t_template_head;
                            }
                            break :stringLiteral;
                        }
                        continue :stringLiteral;
                    }
                },
                else => {
                    if (quote == lexer.code_point) {
                        try lexer.step();
                        break :stringLiteral;
                    }
                    // Non-ASCII strings need the slow path
                    if (lexer.code_point >= 0x80) {
                        needs_slow_path = true;
                    } else if (lexer.json_options != null and lexer.code_point < 0x20) {
                        try lexer.syntaxError();
                    }
                },
            }
            try lexer.step();
        }
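        // At this point, needs_slow_path is true only if the literal contained an
        // escape sequence, a newline that must be normalized, or a non-ASCII byte.
        // Otherwise the text below is sliced directly out of the source, skipping
        // the UTF-8 -> UTF-16 conversion entirely.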
        // Reset string literal
        lexer.string_literal = &([_]u16{});
        lexer.string_literal_slice = lexer.source.contents[lexer.start + 1 .. lexer.end - suffixLen];
        lexer.string_literal_is_ascii = !needs_slow_path;
        lexer.string_literal_buffer.shrinkRetainingCapacity(0);
        lexer.string_literal.len = lexer.string_literal_slice.len;
        if (needs_slow_path) {
            lexer.string_literal_buffer.ensureTotalCapacity(lexer.string_literal_slice.len) catch unreachable;
            var slice = lexer.string_literal_buffer.allocatedSlice();
            lexer.string_literal_buffer.items = slice[0..strings.toUTF16Buf(lexer.string_literal_slice, slice)];
            lexer.string_literal = lexer.string_literal_buffer.items;
            lexer.string_literal_slice = &[_]u8{};
        }

        if (quote == '\'' and lexer.json_options != null) {
            try lexer.addRangeError(lexer.range(), "JSON strings must use double quotes", .{}, true);
        }
    }

    fn step(lexer: *LexerType) !void {
        lexer.code_point = try lexer.nextCodepoint();

        // Track the approximate number of newlines in the file so we can preallocate
        // the line offset table in the printer for source maps. The line offset table
        // is the #1 highest allocation in the heap profile, so this is worth doing.
        // This count is approximate because it handles "\n" and "\r\n" (the common
        // cases) but not "\r" or "\u2028" or "\u2029". Getting this wrong is harmless
        // because it's only a preallocation. The array will just grow if it's too small.
        if (lexer.code_point == '\n') {
            lexer.approximate_newline_count += 1;
        }
    }

    pub fn expect(self: *LexerType, comptime token: T) !void {
        if (self.token != token) {
            try self.expected(token);
        }

        try self.next();
    }

    pub fn expectOrInsertSemicolon(lexer: *LexerType) !void {
        if (lexer.token == T.t_semicolon or (!lexer.has_newline_before and lexer.token != T.t_close_brace and lexer.token != T.t_end_of_file)) {
            try lexer.expect(T.t_semicolon);
        }
    }

    pub fn addUnsupportedSyntaxError(self: *LexerType, msg: []const u8) !void {
        self.addError(self.end, "Unsupported syntax: {s}", .{msg}, true);
        return Error.SyntaxError;
    }

    pub fn scanIdentifierWithEscapes(self: *LexerType) !void {
        try self.addUnsupportedSyntaxError("escape sequence");
    }

    pub fn debugInfo(self: *LexerType) void {
        if (self.log.errors > 0) {
            const stderr = std.io.getStdErr().writer();
            self.log.print(stderr) catch unreachable;
        } else {
            if (self.token == T.t_identifier or self.token == T.t_string_literal) {
                Output.print(" {s} ", .{self.raw()});
            } else {
                Output.print(" <{s}> ", .{tokenToString.get(self.token)});
            }
        }
    }

    pub fn expectContextualKeyword(self: *LexerType, comptime keyword: string) !void {
        if (!self.isContextualKeyword(keyword)) {
            if (std.builtin.mode == std.builtin.Mode.Debug) {
                self.addError(self.start, "Expected \"{s}\" but found \"{s}\" (token: {s})", .{
                    keyword,
                    self.raw(),
                    self.token,
                }, true);
            } else {
                self.addError(self.start, "Expected \"{s}\" but found \"{s}\"", .{ keyword, self.raw() }, true);
            }
            return Error.UnexpectedSyntax;
        }
        try self.next();
    }

    pub fn maybeExpandEquals(lexer: *LexerType) !void {
        switch (lexer.code_point) {
            '>' => {
                // "=" + ">" = "=>"
                lexer.token = .t_equals_greater_than;
                try lexer.step();
            },
            '=' => {
                // "=" + "=" = "=="
                lexer.token = .t_equals_equals;
                try lexer.step();

                if (lexer.code_point == '=') {
                    // "=" + "==" = "==="
                    lexer.token = .t_equals_equals_equals;
                    try lexer.step();
                }
            },
            else => {},
        }
    }
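    // Explanatory note (not part of the original source): when parsing TypeScript
    // generics such as `Array<Array<number>>`, the lexer has already merged the
    // trailing characters into a single token like `>>`. The parser only wants one
    // `<` or `>` at a time, so the two functions below peel one character off the
    // current token, bump `lexer.start`, and reinterpret the remainder instead of
    // re-lexing the input.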
    pub fn expectLessThan(lexer: *LexerType, is_inside_jsx_element: bool) !void {
        switch (lexer.token) {
            .t_less_than => {
                if (is_inside_jsx_element) {
                    try lexer.nextInsideJSXElement();
                } else {
                    try lexer.next();
                }
            },
            .t_less_than_equals => {
                lexer.token = .t_equals;
                lexer.start += 1;
                try lexer.maybeExpandEquals();
            },
            .t_less_than_less_than => {
                lexer.token = .t_less_than;
                lexer.start += 1;
            },
            .t_less_than_less_than_equals => {
                lexer.token = .t_less_than_equals;
                lexer.start += 1;
            },
            else => {
                try lexer.expected(.t_less_than);
            },
        }
    }

    pub fn expectGreaterThan(lexer: *LexerType, is_inside_jsx_element: bool) !void {
        switch (lexer.token) {
            .t_greater_than => {
                if (is_inside_jsx_element) {
                    try lexer.nextInsideJSXElement();
                } else {
                    try lexer.next();
                }
            },
            .t_greater_than_equals => {
                lexer.token = .t_equals;
                lexer.start += 1;
                try lexer.maybeExpandEquals();
            },
            .t_greater_than_greater_than => {
                lexer.token = .t_greater_than;
                lexer.start += 1;
            },
            // ">>=" minus the leading ">" leaves ">="
            .t_greater_than_greater_than_equals => {
                lexer.token = .t_greater_than_equals;
                lexer.start += 1;
            },
            // ">>>" minus the leading ">" leaves ">>"
            .t_greater_than_greater_than_greater_than => {
                lexer.token = .t_greater_than_greater_than;
                lexer.start += 1;
            },
            else => {
                try lexer.expected(.t_greater_than);
            },
        }
    }

    pub fn next(lexer: *LexerType) !void {
        lexer.has_newline_before = lexer.end == 0;

        lex: while (true) {
            lexer.start = lexer.end;
            lexer.token = T.t_end_of_file;

            switch (lexer.code_point) {
                -1 => {
                    lexer.token = T.t_end_of_file;
                },

                '#' => {
                    if (lexer.start == 0 and lexer.source.contents.len > 1 and lexer.source.contents[1] == '!') {
                        try lexer.addUnsupportedSyntaxError("#!hashbang is not supported yet.");
                        return;
                    }

                    try lexer.step();
                    if (!isIdentifierStart(lexer.code_point)) {
                        try lexer.syntaxError();
                    }
                    try lexer.step();

                    while (isIdentifierContinue(lexer.code_point)) {
                        try lexer.step();
                    }
                    if (lexer.code_point == '\\') {
                        try lexer.scanIdentifierWithEscapes();
                        lexer.token = T.t_private_identifier;
                        // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
                    } else {
                        lexer.token = T.t_private_identifier;
                        lexer.identifier = lexer.raw();
                    }
                    break;
                },

                '\r', '\n', 0x2028, 0x2029 => {
                    try lexer.step();
                    lexer.has_newline_before = true;
                    continue;
                },

                '\t', ' ' => {
                    try lexer.step();
                    continue;
                },

                '(' => {
                    try lexer.step();
                    lexer.token = T.t_open_paren;
                },
                ')' => {
                    try lexer.step();
                    lexer.token = T.t_close_paren;
                },
                '[' => {
                    try lexer.step();
                    lexer.token = T.t_open_bracket;
                },
                ']' => {
                    try lexer.step();
                    lexer.token = T.t_close_bracket;
                },
                '{' => {
                    try lexer.step();
                    lexer.token = T.t_open_brace;
                },
                '}' => {
                    try lexer.step();
                    lexer.token = T.t_close_brace;
                },
                ',' => {
                    try lexer.step();
                    lexer.token = T.t_comma;
                },
                ':' => {
                    try lexer.step();
                    lexer.token = T.t_colon;
                },
                ';' => {
                    try lexer.step();
                    lexer.token = T.t_semicolon;
                },
                '@' => {
                    try lexer.step();
                    lexer.token = T.t_at;
                },
                '~' => {
                    try lexer.step();
                    lexer.token = T.t_tilde;
                },

                '?' => {
                    // '?' or '?.' or '??' or '??='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '?' => {
                            try lexer.step();
                            switch (lexer.code_point) {
                                '=' => {
                                    try lexer.step();
                                    lexer.token = T.t_question_question_equals;
                                },
                                else => {
                                    lexer.token = T.t_question_question;
                                },
                            }
                        },
                        '.' => {
                            lexer.token = T.t_question;
                            const current = lexer.current;
                            const contents = lexer.source.contents;

                            // Lookahead to disambiguate with 'a?.1:b'
                            if (current < contents.len) {
                                const c = contents[current];
                                if (c < '0' or c > '9') {
                                    try lexer.step();
                                    lexer.token = T.t_question_dot;
                                }
                            }
                        },
                        else => {
                            lexer.token = T.t_question;
                        },
                    }
                },
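                // Explanatory note (not part of the original source): `?.` followed
                // by a digit is not optional chaining. In `a?.1:b`, the source reads
                // as the conditional `a ? .1 : b`, so the '.' case above only emits
                // t_question_dot when the character after the `.` is not 0-9.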
                '%' => {
                    // '%' or '%='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = T.t_percent_equals;
                        },
                        else => {
                            lexer.token = T.t_percent;
                        },
                    }
                },

                '&' => {
                    // '&' or '&=' or '&&' or '&&='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = T.t_ampersand_equals;
                        },
                        '&' => {
                            try lexer.step();
                            switch (lexer.code_point) {
                                '=' => {
                                    try lexer.step();
                                    lexer.token = T.t_ampersand_ampersand_equals;
                                },
                                else => {
                                    lexer.token = T.t_ampersand_ampersand;
                                },
                            }
                        },
                        else => {
                            lexer.token = T.t_ampersand;
                        },
                    }
                },

                '|' => {
                    // '|' or '|=' or '||' or '||='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = T.t_bar_equals;
                        },
                        '|' => {
                            try lexer.step();
                            switch (lexer.code_point) {
                                '=' => {
                                    try lexer.step();
                                    lexer.token = T.t_bar_bar_equals;
                                },
                                else => {
                                    lexer.token = T.t_bar_bar;
                                },
                            }
                        },
                        else => {
                            lexer.token = T.t_bar;
                        },
                    }
                },

                '^' => {
                    // '^' or '^='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = T.t_caret_equals;
                        },
                        else => {
                            lexer.token = T.t_caret;
                        },
                    }
                },

                '+' => {
                    // '+' or '+=' or '++'
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = T.t_plus_equals;
                        },
                        '+' => {
                            try lexer.step();
                            lexer.token = T.t_plus_plus;
                        },
                        else => {
                            lexer.token = T.t_plus;
                        },
                    }
                },

                '-' => {
                    // '-' or '-=' or '--' or '-->'
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = T.t_minus_equals;
                        },
                        '-' => {
                            try lexer.step();

                            if (lexer.code_point == '>' and lexer.has_newline_before) {
                                try lexer.step();
                                lexer.log.addRangeWarning(lexer.source, lexer.range(), "Treating \"-->\" as the start of a legacy HTML single-line comment") catch unreachable;

                                singleLineHTMLCloseComment: while (true) {
                                    switch (lexer.code_point) {
                                        '\r', '\n', 0x2028, 0x2029 => {
                                            break :singleLineHTMLCloseComment;
                                        },
                                        -1 => {
                                            break :singleLineHTMLCloseComment;
                                        },
                                        else => {},
                                    }
                                    try lexer.step();
                                }
                                continue;
                            }

                            lexer.token = T.t_minus_minus;
                        },
                        else => {
                            lexer.token = T.t_minus;
                        },
                    }
                },

                '*' => {
                    // '*' or '*=' or '**' or '**='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = .t_asterisk_equals;
                        },
                        '*' => {
                            try lexer.step();
                            switch (lexer.code_point) {
                                '=' => {
                                    try lexer.step();
                                    lexer.token = .t_asterisk_asterisk_equals;
                                },
                                else => {
                                    lexer.token = .t_asterisk_asterisk;
                                },
                            }
                        },
                        else => {
                            lexer.token = .t_asterisk;
                        },
                    }
                },
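                // Explanatory note (not part of the original source): '/' is heavily
                // overloaded. The case below distinguishes division, '/=', line
                // comments, and block comments; regex literals are not decided here,
                // since only the parser knows whether a slash appears in expression
                // position.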
                '/' => {
                    // '/' or '/=' or '//' or '/* ... */'
                    try lexer.step();

                    if (lexer.for_global_name) {
                        lexer.token = .t_slash;
                        break;
                    }

                    switch (lexer.code_point) {
                        '=' => {
                            try lexer.step();
                            lexer.token = .t_slash_equals;
                        },
                        '/' => {
                            singleLineComment: while (true) {
                                try lexer.step();
                                switch (lexer.code_point) {
                                    '\r', '\n', 0x2028, 0x2029 => {
                                        break :singleLineComment;
                                    },
                                    -1 => {
                                        break :singleLineComment;
                                    },
                                    else => {},
                                }
                            }

                            if (lexer.json_options) |json| {
                                if (!json.allow_comments) {
                                    try lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true);
                                    return;
                                }
                            }
                            lexer.scanCommentText();
                            continue;
                        },
                        '*' => {
                            try lexer.step();

                            multiLineComment: while (true) {
                                switch (lexer.code_point) {
                                    '*' => {
                                        try lexer.step();
                                        if (lexer.code_point == '/') {
                                            try lexer.step();
                                            break :multiLineComment;
                                        }
                                    },
                                    '\r', '\n', 0x2028, 0x2029 => {
                                        try lexer.step();
                                        lexer.has_newline_before = true;
                                    },
                                    -1 => {
                                        lexer.start = lexer.end;
                                        lexer.addError(lexer.start, "Expected \"*/\" to terminate multi-line comment", .{}, true);
                                    },
                                    else => {
                                        try lexer.step();
                                    },
                                }
                            }

                            if (lexer.json_options) |json| {
                                if (!json.allow_comments) {
                                    try lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true);
                                    return;
                                }
                            }
                            lexer.scanCommentText();
                            continue;
                        },
                        else => {
                            lexer.token = .t_slash;
                        },
                    }
                },

                '=' => {
                    // '=' or '=>' or '==' or '==='
                    try lexer.step();
                    switch (lexer.code_point) {
                        '>' => {
                            try lexer.step();
                            lexer.token = T.t_equals_greater_than;
                        },
                        '=' => {
                            try lexer.step();
                            switch (lexer.code_point) {
                                '=' => {
                                    try lexer.step();
                                    lexer.token = T.t_equals_equals_equals;
                                },
                                else => {
                                    lexer.token = T.t_equals_equals;
                                },
                            }
                        },
                        else => {
                            lexer.token = T.t_equals;
                        },
                    }
                },

                '<' => {
                    // '<' or '<<' or '<=' or '<<=' or '