const std = @import("std");
const logger = @import("root").bun.logger;
const tables = @import("js_lexer_tables.zig");
const build_options = @import("build_options");
const js_ast = bun.JSAst;

const bun = @import("root").bun;
const string = bun.string;
const Output = bun.Output;
const Global = bun.Global;
const Environment = bun.Environment;
const strings = bun.strings;
const CodePoint = bun.CodePoint;
const MutableString = bun.MutableString;
const stringZ = bun.stringZ;
const default_allocator = bun.default_allocator;
const C = bun.C;
const FeatureFlags = @import("feature_flags.zig");
const JavascriptString = []const u16;

const unicode = std.unicode;

const Source = logger.Source;
pub const T = tables.T;
pub const Keywords = tables.Keywords;
pub const tokenToString = tables.tokenToString;
pub const StrictModeReservedWords = tables.StrictModeReservedWords;
pub const PropertyModifierKeyword = tables.PropertyModifierKeyword;
pub const TypescriptStmtKeyword = tables.TypescriptStmtKeyword;
pub const TypeScriptAccessibilityModifier = tables.TypeScriptAccessibilityModifier;
pub const ChildlessJSXTags = tables.ChildlessJSXTags;

/// Panics with a fixed "not implemented yet!" message.
fn notimpl() noreturn {
    Global.panic("not implemented yet!", .{});
}

/// A one-element UTF-16 array holding a single zero code unit.
pub var emptyJavaScriptString = ([_]u16{0});

/// Spans recorded for the JSX-related pragma comments.
/// Each accessor returns `null` while the corresponding span's text is empty.
pub const JSXPragma = struct {
    _jsx: js_ast.Span = .{},
    _jsxFrag: js_ast.Span = .{},
    _jsxRuntime: js_ast.Span = .{},
    _jsxImportSource: js_ast.Span = .{},

    /// Returns the `_jsx` span, or `null` when its text is empty.
    pub fn jsx(this: *const JSXPragma) ?js_ast.Span {
        return if (this._jsx.text.len > 0) this._jsx else null;
    }

    /// Returns the `_jsxFrag` span, or `null` when its text is empty.
    pub fn jsxFrag(this: *const JSXPragma) ?js_ast.Span {
        return if (this._jsxFrag.text.len > 0) this._jsxFrag else null;
    }

    /// Returns the `_jsxRuntime` span, or `null` when its text is empty.
    pub fn jsxRuntime(this: *const JSXPragma) ?js_ast.Span {
        return if (this._jsxRuntime.text.len > 0) this._jsxRuntime else null;
    }

    /// Returns the `_jsxImportSource` span, or `null` when its text is empty.
    pub fn jsxImportSource(this: *const JSXPragma) ?js_ast.Span {
        return if (this._jsxImportSource.text.len > 0) this._jsxImportSource else null;
    }
};

/// Compile-time switches that specialize the lexer for JSON-like inputs.
pub const JSONOptions = struct {
    /// Enable JSON-specific warnings/errors
    is_json: bool = false,

    /// tsconfig.json supports comments & trailing commas
    allow_comments: bool = false,
    allow_trailing_commas: bool = false,

    /// Loading JSON-in-JSON may start like \\""\\"
    /// This is technically invalid, since we parse from the first value of the string
    ignore_leading_escape_sequences: bool = false,
    ignore_trailing_escape_sequences: bool = false,

    json_warn_duplicate_keys: bool = true,

    /// mark as originally for a macro to enable inlining
    was_originally_macro: bool = false,
};

/// Decodes `bytes` (including any JavaScript escape sequences) into UTF-16
/// using a throwaway non-JSON lexer over an empty source.
///
/// The returned slice is allocated with `allocator`, is exactly as long as the
/// decoded text, and is owned by the caller. On error nothing is leaked.
pub fn decodeUTF8(bytes: string, allocator: std.mem.Allocator) ![]const u16 {
    var log = logger.Log.init(allocator);
    defer log.deinit();

    const source = logger.Source.initEmptyFile("");
    var lexer = try NewLexer(.{}).init(&log, source, allocator);
    defer lexer.deinit();

    var buf = std.ArrayList(u16).init(allocator);
    // decodeEscapeSequences writes its working copy back into `buf` on every
    // exit path, so this releases whatever was allocated before a failure.
    errdefer buf.deinit();
    try lexer.decodeEscapeSequences(0, bytes, @TypeOf(buf), &buf);

    // Hand out an exactly-sized allocation instead of a view into the list's
    // (larger) capacity, so the caller can free it with `allocator.free`.
    return try buf.toOwnedSlice();
}

/// Returns the lexer type specialized for `json_options`.
pub fn NewLexer(
    comptime json_options: JSONOptions,
) type {
    return NewLexer_(
        json_options.is_json,
        json_options.allow_comments,
        json_options.allow_trailing_commas,
        json_options.ignore_leading_escape_sequences,
        json_options.ignore_trailing_escape_sequences,
        json_options.json_warn_duplicate_keys,
        json_options.was_originally_macro,
    );
}

/// Builds the lexer type from the individual `JSONOptions` flags.
/// `NewLexer` forwards each field of its options struct to this function.
fn NewLexer_(
    comptime json_options_is_json: bool,
    comptime json_options_allow_comments: bool,
    comptime json_options_allow_trailing_commas: bool,
    comptime json_options_ignore_leading_escape_sequences: bool,
    comptime json_options_ignore_trailing_escape_sequences: bool,
    comptime json_options_json_warn_duplicate_keys: bool,
    comptime json_options_was_originally_macro: bool,
) type {
    const json_options = JSONOptions{
        .is_json = json_options_is_json,
        .allow_comments = json_options_allow_comments,
        .allow_trailing_commas = json_options_allow_trailing_commas,
        .ignore_leading_escape_sequences = json_options_ignore_leading_escape_sequences,
        .ignore_trailing_escape_sequences = json_options_ignore_trailing_escape_sequences,
        .json_warn_duplicate_keys = json_options_json_warn_duplicate_keys,
        .was_originally_macro = json_options_was_originally_macro,
    };
    return struct {
        const LexerType = @This();
        const is_json = json_options.is_json;
        const json = json_options;
        // `bool` for JSON lexers, `void` (zero bits) otherwise.
        const JSONBool = if (is_json) bool else void;
        const JSONBoolDefault: JSONBool = if (is_json) true else {};

        pub const Error = error{
            UTF8Fail,
            OutOfMemory,
            SyntaxError,
            UnexpectedSyntax,
            JSONStringsMustUseDoubleQuotes,
            ParserError,
        };

        // pub const Error = error{
        //     UnexpectedToken,
        //     EndOfFile,
        // };

        // err: ?LexerType.Error,
        log: *logger.Log,
        source: logger.Source,
        current: usize = 0,
        start: usize = 0,
        end: usize = 0,
        did_panic: bool = false,
        approximate_newline_count: usize = 0,
        previous_backslash_quote_in_jsx: logger.Range = logger.Range.None,
        token: T = T.t_end_of_file,
        has_newline_before: bool = false,
        has_pure_comment_before: bool = false,
        preserve_all_comments_before: bool = false,
        is_legacy_octal_literal: bool = false,
        is_log_disabled: bool = false,
        comments_to_preserve_before: std.ArrayList(js_ast.G.Comment),
        code_point: CodePoint = -1,
        identifier: []const u8 = "",
        jsx_pragma: JSXPragma = .{},
        bun_pragma: bool = false,
        source_mapping_url: ?js_ast.Span = null,
        number: f64 = 0.0,
        rescan_close_brace_as_template_token: bool = false,
        prev_error_loc: logger.Loc = logger.Loc.Empty,
        regex_flags_start: ?u16 = null,
        allocator: std.mem.Allocator,
        /// In JavaScript, strings are stored as UTF-16, but nearly every string is ascii.
        /// This means, usually, we can skip UTF8 -> UTF16 conversions.
        string_literal_buffer: std.ArrayList(u16),
        string_literal_slice: string = "",
        string_literal: JavascriptString,
        string_literal_is_ascii: bool = false,

        /// Only used for JSON stringification when bundling
        /// This is a zero-bit type unless we're parsing JSON.
is_ascii_only: JSONBool = JSONBoolDefault,
        track_comments: bool = false,
        all_comments: std.ArrayList(logger.Range),

        /// Returns a shallow copy of the complete lexer state.
        ///
        /// Every field is copied, including `bun_pragma` and `track_comments`,
        /// which a hand-written field list previously left at their defaults.
        /// The copy shares `log`, `allocator` and the backing memory of the
        /// array-list fields with `self`; it is a snapshot, not a deep clone.
        pub fn clone(self: *const LexerType) LexerType {
            return self.*;
        }

        /// Location of the start of the current token.
        pub inline fn loc(self: *const LexerType) logger.Loc {
            return logger.usize2Loc(self.start);
        }

        /// Logs a generic syntax error at the current token start and always
        /// returns `Error.SyntaxError`.
        pub fn syntaxError(self: *LexerType) !void {
            @setCold(true);

            self.addError(self.start, "Syntax Error!!", .{}, true);
            return Error.SyntaxError;
        }

        /// Logs `msg` at the current token start and always returns
        /// `Error.SyntaxError`.
        pub fn addDefaultError(self: *LexerType, msg: []const u8) !void {
            @setCold(true);

            self.addError(self.start, "{s}", .{msg}, true);
            return Error.SyntaxError;
        }

        /// Logs a formatted message at byte offset `_loc` and always returns
        /// `Error.SyntaxError`.
        pub fn addSyntaxError(self: *LexerType, _loc: usize, comptime fmt: []const u8, args: anytype) !void {
            @setCold(true);
            self.addError(_loc, fmt, args, false);
return Error.SyntaxError;
        }

        /// Appends a formatted error at byte offset `_loc` to the log.
        ///
        /// Does nothing when logging is disabled or when the previous error was
        /// reported at the same location. The trailing `bool` is ignored.
        pub fn addError(self: *LexerType, _loc: usize, comptime format: []const u8, args: anytype, _: bool) void {
            @setCold(true);

            if (self.is_log_disabled) return;
            const __loc = logger.usize2Loc(_loc);
            if (__loc.eql(self.prev_error_loc)) {
                return;
            }

            self.log.addErrorFmt(&self.source, __loc, self.allocator, format, args) catch unreachable;
            self.prev_error_loc = __loc;
        }

        /// Appends a formatted error covering range `r` to the log.
        ///
        /// Does nothing when logging is disabled or when the previous error was
        /// reported at `r.loc`. Allocation failures while formatting or logging
        /// are returned to the caller. The trailing `bool` is ignored.
        pub fn addRangeError(self: *LexerType, r: logger.Range, comptime format: []const u8, args: anytype, _: bool) !void {
            @setCold(true);

            if (self.is_log_disabled) return;
            if (self.prev_error_loc.eql(r.loc)) {
                return;
            }

            const errorMessage = try std.fmt.allocPrint(self.allocator, format, args);
            errdefer self.allocator.free(errorMessage);
            try self.log.addRangeError(&self.source, r, errorMessage);
            self.prev_error_loc = r.loc;

            // if (panic) {
            //     return Error.ParserError;
            // }
        }

        /// Look ahead at the next n codepoints without advancing the iterator.
        /// If fewer than n codepoints are available, then return the remainder of the string.
        fn peek(it: *LexerType, n: usize) string {
            const original_i = it.current;
            // The cursor is moved temporarily below and restored on exit.
            defer it.current = original_i;

            var end_ix = original_i;
            var found: usize = 0;
            while (found < n and it.current < it.source.contents.len) : (found += 1) {
                const next_codepoint = it.nextCodepointSlice();
                if (next_codepoint.len == 0) break;
                end_ix += next_codepoint.len;
                // nextCodepointSlice does not move the cursor itself; step past
                // this code point so the next iteration reads the following one.
                it.current = end_ix;
            }

            return it.source.contents[original_i..end_ix];
        }

        /// True when the current token's enum value is at or above `T.t_identifier`.
        pub inline fn isIdentifierOrKeyword(lexer: LexerType) bool {
            return @intFromEnum(lexer.token) >= @intFromEnum(T.t_identifier);
        }

        /// Releases the memory held by the two comment lists.
        pub fn deinit(this: *LexerType) void {
            this.all_comments.clearAndFree();
            this.comments_to_preserve_before.clearAndFree();
        }

        /// Decodes `text` into UTF-16 code units appended to `buf_`, resolving
        /// JavaScript escape sequences and normalizing `\r` / `\r\n` to `\n`.
        ///
        /// `start` is the byte offset used as the base when computing error
        /// positions. `BufType` must provide `append`, `appendAssumeCapacity`
        /// and `ensureUnusedCapacity`. The function works on a copy of `buf_.*`
        /// and writes it back on every exit path, including errors.
        ///
        /// JSON lexers clear `is_ascii_only` and reject escapes that JSON does
        /// not allow (except `\v`, which is deliberately accepted).
        fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
            var buf = buf_.*;
            defer buf_.* = buf;
            if (comptime is_json) lexer.is_ascii_only = false;

            const iterator = strings.CodepointIterator{ .bytes = text, .i = 0 };
            var iter = strings.CodepointIterator.Cursor{};
            while (iterator.next(&iter)) {
                const width = iter.width;
                switch (iter.c) {
                    '\r' => {
                        // From the specification:
                        //
                        // 11.8.6.1 Static Semantics: TV and TRV
                        //
                        // TV excludes the code units of LineContinuation while TRV includes
                        // them. <CR><LF> and <CR> LineTerminatorSequences are normalized to
                        // <LF> for both TV and TRV. An explicit EscapeSequence is needed to
                        // include a <CR> or <CR><LF> sequence.

                        // Convert '\r\n' into '\n'
                        const next_i: usize = iter.i + 1;
                        iter.i += @as(u32, @intFromBool(next_i < text.len and text[next_i] == '\n'));

                        // Convert '\r' into '\n'
                        buf.append('\n') catch unreachable;
                        continue;
                    },

                    '\\' => {
                        // A lone trailing backslash ends decoding silently.
                        _ = iterator.next(&iter) or return;

                        const c2 = iter.c;
                        const width2 = iter.width;
                        switch (c2) {
                            // https://mathiasbynens.be/notes/javascript-escapes#single
                            'b' => {
                                buf.append(0x08) catch unreachable;
                                continue;
                            },
                            'f' => {
                                buf.append(0x0C) catch unreachable;
                                continue;
                            },
                            'n' => {
                                buf.append(0x0A) catch unreachable;
                                continue;
                            },
                            'v' => {
                                // Vertical tab is invalid JSON
                                // We're going to allow it.
                                // if (comptime is_json) {
                                //     lexer.end = start + iter.i - width2;
                                //     try lexer.syntaxError();
                                // }
                                buf.append(0x0B) catch unreachable;
                                continue;
                            },
                            't' => {
                                buf.append(0x09) catch unreachable;
                                continue;
                            },
                            'r' => {
                                buf.append(0x0D) catch unreachable;
                                continue;
                            },

                            // legacy octal literals
                            '0'...'7' => {
                                const octal_start = (iter.i + width2) - 2;
                                if (comptime is_json) {
                                    lexer.end = start + iter.i - width2;
                                    try lexer.syntaxError();
                                }

                                // 1-3 digit octal
                                var is_bad = false;
                                var value: i64 = c2 - '0';
                                // Cursor to fall back to when a look-ahead
                                // character turns out not to belong to the escape.
                                var restore = iter;

                                _ = iterator.next(&iter) or {
                                    if (value == 0) {
                                        try buf.append(0);
                                        return;
                                    }

                                    try lexer.syntaxError();
                                    return;
                                };

                                const c3: CodePoint = iter.c;

                                switch (c3) {
                                    '0'...'7' => {
                                        value = value * 8 + c3 - '0';
                                        restore = iter;
                                        _ = iterator.next(&iter) or return lexer.syntaxError();

                                        const c4 = iter.c;
                                        switch (c4) {
                                            '0'...'7' => {
                                                // Only take the third digit if the
                                                // result still fits in one byte.
                                                const temp = value * 8 + c4 - '0';
                                                if (temp < 256) {
                                                    value = temp;
                                                } else {
                                                    iter = restore;
                                                }
                                            },
                                            '8', '9' => {
                                                is_bad = true;
                                            },
                                            else => {
                                                iter = restore;
                                            },
                                        }
                                    },
                                    '8', '9' => {
                                        is_bad = true;
                                    },
                                    else => {
                                        iter = restore;
                                    },
                                }

                                iter.c = @as(i32, @intCast(value));
                                if (is_bad) {
                                    lexer.addRangeError(
                                        logger.Range{ .loc = .{ .start = @as(i32, @intCast(octal_start)) }, .len = @as(i32, @intCast(iter.i - octal_start)) },
                                        "Invalid legacy octal literal",
                                        .{},
                                        false,
                                    ) catch unreachable;
                                }
                            },
                            '8', '9' => {
                                iter.c = c2;
                            },
                            // 2-digit hexadecimal
                            'x' => {
                                var value: CodePoint = 0;
                                var c3: CodePoint = 0;
                                var width3: u3 = 0;

                                _ = iterator.next(&iter) or return lexer.syntaxError();
                                c3 = iter.c;
                                width3 = iter.width;
                                switch (c3) {
                                    '0'...'9' => {
                                        value = value * 16 | (c3 - '0');
                                    },
                                    'a'...'f' => {
                                        value = value * 16 | (c3 + 10 - 'a');
                                    },
                                    'A'...'F' => {
                                        value = value * 16 | (c3 + 10 - 'A');
                                    },
                                    else => {
                                        lexer.end = start + iter.i - width3;
                                        return lexer.syntaxError();
                                    },
                                }

                                _ = iterator.next(&iter) or return lexer.syntaxError();
                                c3 = iter.c;
                                width3 = iter.width;
                                switch (c3) {
                                    '0'...'9' => {
                                        value = value * 16 | (c3 - '0');
                                    },
                                    'a'...'f' => {
                                        value = value * 16 | (c3 + 10 - 'a');
                                    },
                                    'A'...'F' => {
                                        value = value * 16 | (c3 + 10 - 'A');
                                    },
                                    else => {
                                        lexer.end = start + iter.i - width3;
                                        return lexer.syntaxError();
                                    },
                                }

                                iter.c = value;
                            },
                            'u' => {
                                // We're going to make this an i64 so we don't risk integer overflows
                                // when people do weird things
                                var value: i64 = 0;

                                _ = iterator.next(&iter) or return lexer.syntaxError();
                                var c3 = iter.c;
                                var width3 = iter.width;

                                // variable-length
                                if (c3 == '{') {
                                    if (comptime is_json) {
                                        lexer.end = start + iter.i - width2;
                                        try lexer.syntaxError();
                                    }

                                    // Already relative to the source (it includes `start`).
                                    const hex_start = (iter.i + start) - width - width2 - width3;
                                    var is_first = true;
                                    var is_out_of_range = false;
                                    variableLength: while (true) {
                                        _ = iterator.next(&iter) or break :variableLength;
                                        c3 = iter.c;

                                        switch (c3) {
                                            '0'...'9' => {
                                                value = value * 16 | (c3 - '0');
                                            },
                                            'a'...'f' => {
                                                value = value * 16 | (c3 + 10 - 'a');
                                            },
                                            'A'...'F' => {
                                                value = value * 16 | (c3 + 10 - 'A');
                                            },
                                            '}' => {
                                                if (is_first) {
                                                    lexer.end = (start + iter.i) -| width3;
                                                    return lexer.syntaxError();
                                                }
                                                break :variableLength;
                                            },
                                            else => {
                                                lexer.end = (start + iter.i) -| width3;
                                                return lexer.syntaxError();
                                            },
                                        }

                                        // '\U0010FFFF
                                        // copied from golang utf8.MaxRune
                                        if (value > 1114111) {
                                            is_out_of_range = true;
                                            // Clamp so that an arbitrarily long run of hex
                                            // digits cannot overflow the i64 accumulator.
                                            // The value is discarded once out of range.
                                            value = 1114112;
                                        }
                                        is_first = false;
                                    }

                                    if (is_out_of_range) {
                                        // `hex_start` is already absolute, so it must not be
                                        // offset by `start` a second time.
                                        try lexer.addRangeError(
                                            .{ .loc = .{ .start = @as(i32, @intCast(hex_start)) }, .len = @as(i32, @intCast(((iter.i + start) - hex_start))) },
                                            "Unicode escape sequence is out of range",
                                            .{},
                                            true,
                                        );

                                        return;
                                    }

                                    // fixed-length
                                } else {
                                    // Fixed-length
                                    // comptime var j: usize = 0;
                                    var j: usize = 0;
                                    while (j < 4) : (j += 1) {
                                        switch (c3) {
                                            '0'...'9' => {
                                                value = value * 16 | (c3 - '0');
                                            },
                                            'a'...'f' => {
                                                value = value * 16 | (c3 + 10 - 'a');
                                            },
                                            'A'...'F' => {
                                                value = value * 16 | (c3 + 10 - 'A');
                                            },
                                            else => {
                                                lexer.end = start + iter.i - width3;
                                                return lexer.syntaxError();
                                            },
                                        }

                                        // The first digit was read above; fetch the other three.
                                        if (j < 3) {
                                            _ = iterator.next(&iter) or return lexer.syntaxError();
                                            c3 = iter.c;
                                            width3 = iter.width;
                                        }
                                    }
                                }

                                iter.c = @as(CodePoint, @truncate(value));
                            },
                            '\r' => {
                                if (comptime is_json) {
                                    lexer.end = start + iter.i - width2;
                                    try lexer.syntaxError();
                                }

                                // Make sure Windows CRLF counts as a single newline
                                const next_i: usize = iter.i + 1;
                                iter.i += @as(u32, @intFromBool(next_i < text.len and text[next_i] == '\n'));

                                // Ignore line continuations. A line continuation is not an escaped newline.
                                continue;
                            },
                            '\n', 0x2028, 0x2029 => {
                                if (comptime is_json) {
                                    lexer.end = start + iter.i - width2;
                                    try lexer.syntaxError();
                                }

                                // Ignore line continuations. A line continuation is not an escaped newline.
                                continue;
                            },
                            else => {
                                if (comptime is_json) {
                                    switch (c2) {
                                        '"', '\\', '/' => {},
                                        else => {
                                            lexer.end = start + iter.i - width2;
                                            try lexer.syntaxError();
                                        },
                                    }
                                }
                                iter.c = c2;
                            },
                        }
                    },
                    else => {},
                }

                // Emit the (possibly rewritten) code point as UTF-16.
                switch (iter.c) {
                    -1 => return try lexer.addDefaultError("Unexpected end of file"),
                    0...0xFFFF => {
                        buf.append(@as(u16, @intCast(iter.c))) catch unreachable;
                    },
                    else => {
                        // Outside the BMP: encode as a surrogate pair.
                        iter.c -= 0x10000;
                        buf.ensureUnusedCapacity(2) catch unreachable;
                        buf.appendAssumeCapacity(@as(u16, @intCast(0xD800 + ((iter.c >> 10) & 0x3FF))));
                        buf.appendAssumeCapacity(@as(u16, @intCast(0xDC00 + (iter.c & 0x3FF))));
                    },
                }
            }
        }

        /// Result of scanning the body of a string or template literal.
        /// `suffix_len` is the number of trailing bytes that belong to the
        /// terminator; `needs_slow_path` is set when the text cannot be used
        /// verbatim and must go through `decodeEscapeSequences`.
        pub const InnerStringLiteral = packed struct { suffix_len: u3, needs_slow_path: bool };

        /// Advances the lexer over the body of a literal delimited by `quote`
        /// (`0` for unquoted values that end at a newline or end of file).
        ///
        /// For template literals, stopping at `${` sets `lexer.token` to
        /// `t_template_middle` or `t_template_head` and reports a suffix of 2.
        /// Returns `Error.SyntaxError` (after logging) for unterminated
        /// literals and, in JSON, for control characters below 0x20.
        fn parseStringLiteralInnter(lexer: *LexerType, comptime quote: CodePoint) !InnerStringLiteral {
            var needs_slow_path = false;
            var suffix_len: u3 = if (comptime quote == 0) 0 else 1;
            stringLiteral: while (true) {
                switch (lexer.code_point) {
                    '\\' => {
                        lexer.step();

                        // Handle Windows CRLF
                        if (lexer.code_point == '\r' and comptime !is_json) {
                            lexer.step();
                            if (lexer.code_point == '\n') {
                                lexer.step();
                            }
                            continue :stringLiteral;
                        }

                        if (comptime is_json and json_options.ignore_trailing_escape_sequences) {
                            if (lexer.code_point == quote and lexer.current >= lexer.source.contents.len) {
                                lexer.step();

                                break;
                            }
                        }

                        switch (lexer.code_point) {
                            // 0 cannot be in this list because it may be a legacy octal literal
                            'v', 'f', 't', 'r', 'n', '`', '\'', '"', '\\', 0x2028, 0x2029 => {
                                lexer.step();

                                continue :stringLiteral;
                            },
                            else => {
                                needs_slow_path = true;
                            },
                        }
                    },
                    // This indicates the end of the file
                    -1 => {
                        if (comptime quote != 0) {
                            try lexer.addDefaultError("Unterminated string literal");
                        }

                        break :stringLiteral;
                    },

                    '\r' => {
                        if (comptime quote != '`') {
                            try lexer.addDefaultError("Unterminated string literal");
                        }

                        // Template literals require newline normalization
                        needs_slow_path = true;
                    },

                    '\n' => {
                        // Implicitly-quoted strings end when they reach a newline OR end of file
                        // This only applies to .env
                        switch (comptime quote) {
                            0 => {
                                break :stringLiteral;
                            },
                            '`' => {},
                            else => {
                                try lexer.addDefaultError("Unterminated string literal");
                            },
                        }
                    },

                    '$' => {
                        if (comptime quote == '`') {
                            lexer.step();
                            if (lexer.code_point == '{') {
                                suffix_len = 2;
                                lexer.step();
                                lexer.token = if (lexer.rescan_close_brace_as_template_token) T.t_template_middle else T.t_template_head;
                                break :stringLiteral;
                            }
                            continue :stringLiteral;
                        }
                    },
                    // exit condition
                    quote => {
                        lexer.step();

                        break;
                    },

                    else => {
                        // Non-ASCII strings need the slow path
                        if (lexer.code_point >= 0x80) {
                            needs_slow_path = true;
                        } else if (is_json and lexer.code_point < 0x20) {
                            try lexer.syntaxError();
                        } else if (comptime (quote == '"' or quote == '\'') and Environment.isNative) {
                            // For long inputs, jump straight to the next character
                            // that needs individual handling.
                            const remainder = lexer.source.contents[lexer.current..];
                            if (remainder.len >= 4096) {
                                lexer.current += indexOfInterestingCharacterInStringLiteral(remainder, quote) orelse {
                                    lexer.step();
                                    continue;
                                };
                                lexer.end = lexer.current -| 1;
                                lexer.step();
                                continue;
                            }
                        }
                    },
                }
                lexer.step();
            }

            return InnerStringLiteral{ .needs_slow_path = needs_slow_path, .suffix_len = suffix_len };
        }

        /// Scans a string or template literal delimited by `quote`, setting
        /// `token`, `string_literal_slice`, `string_literal_is_ascii` and, when
        /// decoding was required, `string_literal`.
        pub fn parseStringLiteral(lexer: *LexerType, comptime quote: CodePoint) !void {
            if (comptime quote != '`') {
                lexer.token = T.t_string_literal;
            } else if (lexer.rescan_close_brace_as_template_token) {
                lexer.token = T.t_template_tail;
            } else {
                lexer.token = T.t_no_substitution_template_literal;
            }
            // quote is 0 when parsing JSON from .env
            // .env values may not always be quoted.
lexer.step();

            const string_literal_details = try lexer.parseStringLiteralInnter(quote);

            // Reset string literal
            const base = if (comptime quote == 0) lexer.start else lexer.start + 1;
            lexer.string_literal_slice = lexer.source.contents[base..@min(lexer.source.contents.len, lexer.end - @as(usize, string_literal_details.suffix_len))];
            lexer.string_literal_is_ascii = !string_literal_details.needs_slow_path;
            lexer.string_literal_buffer.shrinkRetainingCapacity(0);
            if (string_literal_details.needs_slow_path) {
                lexer.string_literal_buffer.ensureUnusedCapacity(lexer.string_literal_slice.len) catch unreachable;
                try lexer.decodeEscapeSequences(lexer.start, lexer.string_literal_slice, @TypeOf(lexer.string_literal_buffer), &lexer.string_literal_buffer);
                lexer.string_literal = lexer.string_literal_buffer.items;
            }
            if (comptime is_json) lexer.is_ascii_only = lexer.is_ascii_only and lexer.string_literal_is_ascii;

            if (comptime !FeatureFlags.allow_json_single_quotes) {
                if (quote == '\'' and is_json) {
                    try lexer.addRangeError(lexer.range(), "JSON strings must use double quotes", .{}, true);
                }
            }

            // for (text)
            // // if (needs_slow_path) {
            // //     // Slow path
            // //     // lexer.string_literal = lexer.(lexer.start + 1, text);
            // // } else {
            // //     // Fast path
            // // }
        }

        /// Returns the bytes of the code point at `it.current` without moving
        /// the cursor, or an empty slice at end of input or when the sequence
        /// would run past the end of the source.
        inline fn nextCodepointSlice(it: *LexerType) []const u8 {
            // Never index past the end of the source (this also covers an
            // empty source, whose pointer must not be dereferenced).
            if (it.current >= it.source.contents.len) return "";

            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(it.source.contents.ptr[it.current]);
            return if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";
        }

        /// Decodes the code point at `it.current`, records its start in
        /// `it.end`, and advances `it.current` past it. Returns -1 at end of
        /// input; in that case the cursor stays where it is and `it.end` is
        /// set to the source length.
        inline fn nextCodepoint(it: *LexerType) CodePoint {
            if (it.current >= it.source.contents.len) {
                it.end = it.source.contents.len;
                return -1;
            }

            const cp_len = strings.wtf8ByteSequenceLengthWithInvalid(it.source.contents.ptr[it.current]);
            const slice = if (!(cp_len + it.current > it.source.contents.len)) it.source.contents[it.current .. cp_len + it.current] else "";

            const code_point = switch (slice.len) {
                0 => -1,
                1 => @as(CodePoint, slice[0]),
                // NOTE(review): this hands a 4-byte window to the decoder even for
                // 2- and 3-byte sequences — confirm the decoder only reads `len` bytes.
                else => strings.decodeWTF8RuneTMultibyte(slice.ptr[0..4], @as(u3, @intCast(slice.len)), CodePoint, strings.unicode_replacement),
            };

            it.end = it.current;

            // An undecodable sequence is skipped one byte at a time.
            it.current += if (code_point != strings.unicode_replacement)
                cp_len
            else
                1;

            return code_point;
        }

        /// Loads the next code point into `lexer.code_point`.
        fn step(lexer: *LexerType) void {
            lexer.code_point = lexer.nextCodepoint();

            // Track the approximate number of newlines in the file so we can preallocate
            // the line offset table in the printer for source maps. The line offset table
            // is the #1 highest allocation in the heap profile, so this is worth doing.
            // This count is approximate because it handles "\n" and "\r\n" (the common
            // cases) but not "\r" or "\u2028" or "\u2029". Getting this wrong is harmless
            // because it's only a preallocation. The array will just grow if it's too small.
            lexer.approximate_newline_count += @intFromBool(lexer.code_point == '\n');
        }

        /// Requires the current token to be `token` (reporting through
        /// `expected` otherwise) and then advances to the next token.
        pub inline fn expect(self: *LexerType, comptime token: T) !void {
            if (self.token != token) {
                try self.expected(token);
            }

            try self.next();
        }

        /// Consumes a semicolon when one is present. A semicolon is only
        /// demanded when the current token is not preceded by a newline and
        /// is neither `}` nor end of file.
        pub inline fn expectOrInsertSemicolon(lexer: *LexerType) !void {
            if (lexer.token == T.t_semicolon or (!lexer.has_newline_before and
                lexer.token != T.t_close_brace and lexer.token != T.t_end_of_file))
            {
                try lexer.expect(T.t_semicolon);
            }
        }

        /// Logs "Unsupported syntax: <msg>" at `self.end` and always returns
        /// `Error.SyntaxError`.
        pub fn addUnsupportedSyntaxError(self: *LexerType, msg: []const u8) !void {
            self.addError(self.end, "Unsupported syntax: {s}", .{msg}, true);
            return Error.SyntaxError;
        }

        pub const IdentifierKind = enum { normal, private };
        pub const ScanResult = struct { token: T, contents: string };

        // Scratch space for decoding short escaped identifiers without allocating.
        threadlocal var small_escape_sequence_buffer: [4096]u16 = undefined;

        /// Minimal fixed-capacity stand-in for `std.ArrayList(u16)` exposing the
        /// three methods `decodeEscapeSequences` needs. Capacity is only
        /// checked with assertions.
        const FakeArrayList16 = struct {
            items: []u16,
            i: usize = 0,

            pub fn append(fake: *FakeArrayList16, value: u16) !void {
                std.debug.assert(fake.items.len > fake.i);
                fake.items[fake.i] = value;
                fake.i += 1;
            }

            pub fn appendAssumeCapacity(fake: *FakeArrayList16, value: u16) void {
                std.debug.assert(fake.items.len > fake.i);
                fake.items[fake.i] = value;
                fake.i += 1;
            }

            pub fn ensureUnusedCapacity(fake: *FakeArrayList16, int: anytype) !void {
                std.debug.assert(fake.items.len > fake.i + int);
            }
        };

        // Lazily created list used for escaped identifiers of 1024 bytes or more.
        // NOTE(review): it is allocated once with the first lexer's allocator and
        // then reused — confirm that allocator outlives later lexers on the thread.
        threadlocal var large_escape_sequence_list: std.ArrayList(u16) = undefined;
        threadlocal var large_escape_sequence_list_loaded: bool = false;

        // This is an edge case that doesn't really exist in the wild, so it doesn't
        // need to be as fast as possible.
        /// Scans an identifier containing `\u` escapes and returns its decoded
        /// text together with `.t_escaped_keyword` or `.t_identifier`.
        /// For `.private`, validity is checked on the text after its first byte.
        pub fn scanIdentifierWithEscapes(lexer: *LexerType, kind: IdentifierKind) anyerror!ScanResult {
            var result = ScanResult{ .token = .t_end_of_file, .contents = "" };
            // First pass: scan over the identifier to see how long it is
            while (true) {
                // Scan a unicode escape sequence. There is at least one because that's
                // what caused us to get on this slow path in the first place.
                if (lexer.code_point == '\\') {
                    lexer.step();

                    if (lexer.code_point != 'u') {
                        try lexer.syntaxError();
                    }
                    lexer.step();
                    if (lexer.code_point == '{') {
                        // Variable-length
                        lexer.step();
                        while (lexer.code_point != '}') {
                            switch (lexer.code_point) {
                                '0'...'9', 'a'...'f', 'A'...'F' => {
                                    lexer.step();
                                },
                                else => try lexer.syntaxError(),
                            }
                        }

                        lexer.step();
                    } else {
                        // Fixed-length: exactly four hex digits
                        var digit: usize = 0;
                        while (digit < 4) : (digit += 1) {
                            switch (lexer.code_point) {
                                '0'...'9', 'a'...'f', 'A'...'F' => {
                                    lexer.step();
                                },
                                else => try lexer.syntaxError(),
                            }
                        }
                    }
                    continue;
                }

                if (!isIdentifierContinue(lexer.code_point)) {
                    break;
                }
                lexer.step();
            }

            // Second pass: re-use our existing escape sequence parser
            const original_text = lexer.raw();
            if (original_text.len < 1024) {
                var buf = FakeArrayList16{ .items = &small_escape_sequence_buffer, .i = 0 };
                try lexer.decodeEscapeSequences(lexer.start, original_text, FakeArrayList16, &buf);
                result.contents = lexer.utf16ToString(buf.items[0..buf.i]);
            } else {
                if (!large_escape_sequence_list_loaded) {
                    large_escape_sequence_list = try std.ArrayList(u16).initCapacity(lexer.allocator, original_text.len);
                    large_escape_sequence_list_loaded = true;
                }

                large_escape_sequence_list.shrinkRetainingCapacity(0);
                try lexer.decodeEscapeSequences(lexer.start, original_text, std.ArrayList(u16), &large_escape_sequence_list);
                result.contents = lexer.utf16ToString(large_escape_sequence_list.items);
            }

            const identifier = if (kind != .private)
                result.contents
            else
                result.contents[1..];

            if (!isIdentifier(identifier)) {
                try lexer.addRangeError(
                    .{ .loc = logger.usize2Loc(lexer.start), .len = @as(i32, @intCast(lexer.end - lexer.start)) },
                    "Invalid identifier: \"{s}\"",
                    .{result.contents},
                    true,
                );
            }

            // Escaped keywords are not allowed to work as actual keywords, but they are
            // allowed wherever we allow identifiers or keywords. For example:
            //
            //   // This is an error (equivalent to "var var;")
            //   var \u0076\u0061\u0072;
            //
            //   // This is an error (equivalent to "var foo;" except for this rule)
            //   \u0076\u0061\u0072 foo;
            //
            //   // This is fine (equivalent to "foo.var;")
            //   foo.\u0076\u0061\u0072;
            //
            result.token = if (Keywords.has(result.contents)) .t_escaped_keyword else .t_identifier;

            // const text = lexer.decodeEscapeSequences(lexer.start, lexer.raw(), )
            return result;
        }

        /// Requires the current token to be the contextual keyword `keyword`
        /// and advances past it. Otherwise logs what was found (including the
        /// token tag in Debug builds) and returns `Error.UnexpectedSyntax`.
        pub fn expectContextualKeyword(self: *LexerType, comptime keyword: string) !void {
            if (!self.isContextualKeyword(keyword)) {
                if (@import("builtin").mode == std.builtin.Mode.Debug) {
                    self.addError(self.start, "Expected \"{s}\" but found \"{s}\" (token: {s})", .{
                        keyword,
                        self.raw(),
                        @tagName(self.token),
                    }, true);
                } else {
                    self.addError(self.start, "Expected \"{s}\" but found \"{s}\"", .{ keyword, self.raw() }, true);
                }
                return Error.UnexpectedSyntax;
            }
            try self.next();
        }

        /// Based on the current code point, sets the token to `=>`, `==` or
        /// `===` and consumes the matching characters. Any other code point
        /// leaves the lexer unchanged.
        pub fn maybeExpandEquals(lexer: *LexerType) !void {
            switch (lexer.code_point) {
                '>' => {
                    // "=" + ">" = "=>"
                    lexer.token = .t_equals_greater_than;
                    lexer.step();
                },
                '=' => {
                    // "=" + "=" = "=="
                    lexer.token = .t_equals_equals;
                    lexer.step();

                    if (lexer.code_point == '=') {
                        // "=" + "==" = "==="
                        lexer.token = .t_equals_equals_equals;
                        lexer.step();
                    }
                },
                else => {},
            }
        }

        /// Consumes a single `<`. When the current token merely starts with
        /// `<` (`<=`, `<<`, `<<=`), it is rewritten to the remainder and
        /// `start` moves forward by one byte.
        pub fn expectLessThan(lexer: *LexerType, comptime is_inside_jsx_element: bool) !void {
            switch (lexer.token) {
                .t_less_than => {
                    if (is_inside_jsx_element) {
                        try lexer.nextInsideJSXElement();
                    } else {
                        try lexer.next();
                    }
                },
                .t_less_than_equals => {
                    lexer.token = .t_equals;
                    lexer.start += 1;
                    try lexer.maybeExpandEquals();
                },
                .t_less_than_less_than => {
                    lexer.token = .t_less_than;
                    lexer.start += 1;
                },
                .t_less_than_less_than_equals => {
                    lexer.token = .t_less_than_equals;
                    lexer.start += 1;
                },
                else => {
                    try lexer.expected(.t_less_than);
                },
            }
        }

        /// Consumes a single `>`, splitting longer tokens that start with `>`
        /// in the same way `expectLessThan` does for `<`.
        pub fn expectGreaterThan(lexer: *LexerType, comptime is_inside_jsx_element: bool) !void {
            switch (lexer.token) {
                .t_greater_than => {
                    if (is_inside_jsx_element)
{ try lexer.nextInsideJSXElement(); } else { try lexer.next(); } }, .t_greater_than_equals => { lexer.token = .t_equals; lexer.start += 1; try lexer.maybeExpandEquals(); }, .t_greater_than_greater_than_equals => { lexer.token = .t_greater_than_equals; lexer.start += 1; }, .t_greater_than_greater_than_greater_than_equals => { lexer.token = .t_greater_than_greater_than_equals; lexer.start += 1; }, .t_greater_than_greater_than => { lexer.token = .t_greater_than; lexer.start += 1; }, .t_greater_than_greater_than_greater_than => { lexer.token = .t_greater_than_greater_than; lexer.start += 1; }, else => { try lexer.expected(.t_greater_than); }, } } pub fn next(lexer: *LexerType) !void { lexer.has_newline_before = lexer.end == 0; lexer.has_pure_comment_before = false; while (true) { lexer.start = lexer.end; lexer.token = T.t_end_of_file; switch (lexer.code_point) { -1 => { lexer.token = T.t_end_of_file; }, '#' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Private identifiers are not allowed in JSON"); } if (lexer.start == 0 and lexer.source.contents[1] == '!') { // "#!/usr/bin/env node" lexer.token = .t_hashbang; hashbang: while (true) { lexer.step(); switch (lexer.code_point) { '\r', '\n', 0x2028, 0x2029 => { break :hashbang; }, -1 => { break :hashbang; }, else => {}, } } lexer.identifier = lexer.raw(); } else { // "#foo" lexer.step(); if (lexer.code_point == '\\') { lexer.identifier = (try lexer.scanIdentifierWithEscapes(.private)).contents; } else { if (!isIdentifierStart(lexer.code_point)) { try lexer.syntaxError(); } lexer.step(); while (isIdentifierContinue(lexer.code_point)) { lexer.step(); } if (lexer.code_point == '\\') { lexer.identifier = (try lexer.scanIdentifierWithEscapes(.private)).contents; } else { lexer.identifier = lexer.raw(); } } lexer.token = T.t_private_identifier; break; } }, '\r', '\n', 0x2028, 0x2029 => { lexer.step(); lexer.has_newline_before = true; continue; }, '\t', ' ' => { lexer.step(); continue; }, '(' => { 
lexer.step(); lexer.token = T.t_open_paren; }, ')' => { lexer.step(); lexer.token = T.t_close_paren; }, '[' => { lexer.step(); lexer.token = T.t_open_bracket; }, ']' => { lexer.step(); lexer.token = T.t_close_bracket; }, '{' => { lexer.step(); lexer.token = T.t_open_brace; }, '}' => { lexer.step(); lexer.token = T.t_close_brace; }, ',' => { lexer.step(); lexer.token = T.t_comma; }, ':' => { lexer.step(); lexer.token = T.t_colon; }, ';' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Semicolons are not allowed in JSON"); } lexer.step(); lexer.token = T.t_semicolon; }, '@' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Decorators are not allowed in JSON"); } lexer.step(); lexer.token = T.t_at; }, '~' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("~ is not allowed in JSON"); } lexer.step(); lexer.token = T.t_tilde; }, '?' => { // '?' or '?.' or '??' or '??=' lexer.step(); switch (lexer.code_point) { '?' => { lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_question_question_equals; }, else => { lexer.token = T.t_question_question; }, } }, '.' 
=> { lexer.token = T.t_question; const current = lexer.current; const contents = lexer.source.contents; // Lookahead to disambiguate with 'a?.1:b' if (current < contents.len) { const c = contents[current]; if (c < '0' or c > '9') { lexer.step(); lexer.token = T.t_question_dot; } } }, else => { lexer.token = T.t_question; }, } }, '%' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '%' or '%=' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_percent_equals; }, else => { lexer.token = T.t_percent; }, } }, '&' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '&' or '&=' or '&&' or '&&=' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_ampersand_equals; }, '&' => { lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_ampersand_ampersand_equals; }, else => { lexer.token = T.t_ampersand_ampersand; }, } }, else => { lexer.token = T.t_ampersand; }, } }, '|' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '|' or '|=' or '||' or '||=' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_bar_equals; }, '|' => { lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_bar_bar_equals; }, else => { lexer.token = T.t_bar_bar; }, } }, else => { lexer.token = T.t_bar; }, } }, '^' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '^' or '^=' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_caret_equals; }, else => { lexer.token = T.t_caret; }, } }, '+' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '+' or '+=' or '++' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); 
lexer.token = T.t_plus_equals; }, '+' => { lexer.step(); lexer.token = T.t_plus_plus; }, else => { lexer.token = T.t_plus; }, } }, '-' => { // '-' or '-=' or '--' or '-->' lexer.step(); switch (lexer.code_point) { '=' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } lexer.step(); lexer.token = T.t_minus_equals; }, '-' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } lexer.step(); if (lexer.code_point == '>' and lexer.has_newline_before) { lexer.step(); lexer.log.addRangeWarning(&lexer.source, lexer.range(), "Treating \"-->\" as the start of a legacy HTML single-line comment") catch unreachable; singleLineHTMLCloseComment: while (true) { switch (lexer.code_point) { '\r', '\n', 0x2028, 0x2029 => { break :singleLineHTMLCloseComment; }, -1 => { break :singleLineHTMLCloseComment; }, else => {}, } lexer.step(); } continue; } lexer.token = T.t_minus_minus; }, else => { lexer.token = T.t_minus; }, } }, '*' => { // '*' or '*=' or '**' or '**=' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = .t_asterisk_equals; }, '*' => { lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = .t_asterisk_asterisk_equals; }, else => { lexer.token = .t_asterisk_asterisk; }, } }, else => { lexer.token = .t_asterisk; }, } }, '/' => { // '/' or '/=' or '//' or '/* ... 
*/' lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = .t_slash_equals; }, '/' => { singleLineComment: while (true) { lexer.step(); switch (lexer.code_point) { '\r', '\n', 0x2028, 0x2029 => { break :singleLineComment; }, -1 => { break :singleLineComment; }, else => {}, } } if (comptime is_json) { if (!json.allow_comments) { try lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true); return; } } lexer.scanCommentText(); continue; }, '*' => { lexer.step(); multiLineComment: while (true) { switch (lexer.code_point) { '*' => { lexer.step(); if (lexer.code_point == '/') { lexer.step(); break :multiLineComment; } }, '\r', '\n', 0x2028, 0x2029 => { lexer.step(); lexer.has_newline_before = true; }, -1 => { lexer.start = lexer.end; try lexer.addSyntaxError( lexer.start, "Expected \"*/\" to terminate multi-line comment", .{}, ); }, else => { // if (comptime Environment.enableSIMD) { // TODO: this seems to work, but we shouldn't enable this until after improving test coverage // if (lexer.code_point < 128) { // const remainder = lexer.source.contents[lexer.current..]; // if (remainder.len >= 4096) { // lexer.current += skipToInterestingCharacterInMultilineComment(remainder) orelse { // lexer.step(); // continue; // }; // lexer.end = lexer.current -| 1; // lexer.step(); // continue; // } // } // } lexer.step(); }, } } if (comptime is_json) { if (!json.allow_comments) { try lexer.addRangeError(lexer.range(), "JSON does not support comments", .{}, true); return; } } lexer.scanCommentText(); continue; }, else => { lexer.token = .t_slash; }, } }, '=' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '=' or '=>' or '==' or '===' lexer.step(); switch (lexer.code_point) { '>' => { lexer.step(); lexer.token = T.t_equals_greater_than; }, '=' => { lexer.step(); switch (lexer.code_point) { '=' => { lexer.step(); lexer.token = T.t_equals_equals_equals; }, else => { lexer.token 
= T.t_equals_equals; }, } }, else => { lexer.token = T.t_equals; }, } }, '<' => { if (comptime is_json) { return lexer.addUnsupportedSyntaxError("Operators are not allowed in JSON"); } // '<' or '<<' or '<=' or '<<=' or '