diff options
author | 2021-04-18 23:21:37 -0700 | |
---|---|---|
committer | 2021-04-18 23:21:37 -0700 | |
commit | bbfe52d626e57b0228cf86183afb66a83cce14be (patch) | |
tree | 650480f60e0c7baf093c2e811d24d92f92862713 | |
parent | 422e2e8329990a313a150900b5527b47be89d20c (diff) | |
download | bun-bbfe52d626e57b0228cf86183afb66a83cce14be.tar.gz bun-bbfe52d626e57b0228cf86183afb66a83cce14be.tar.zst bun-bbfe52d626e57b0228cf86183afb66a83cce14be.zip |
cool beans
-rw-r--r-- | src/js_lexer.zig | 601 | ||||
-rw-r--r-- | src/logger.zig | 6 |
2 files changed, 496 insertions, 111 deletions
diff --git a/src/js_lexer.zig b/src/js_lexer.zig index 878d81df1..4019c8974 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -1,6 +1,11 @@ const std = @import("std"); const logger = @import("logger.zig"); const tables = @import("js_lexer_tables.zig"); +const alloc = @import("alloc.zig"); +const build_options = @import("build_options"); + +const _f = @import("./test/fixtures.zig"); + const unicode = std.unicode; const Source = logger.Source; @@ -38,7 +43,7 @@ pub const Lexer = struct { // comments_to_preserve_before: []js_ast.Comment, // all_original_comments: []js_ast.Comment, code_point: CodePoint = -1, - string_literal: std.ArrayList([]u16), + string_literal: []u16, identifier: []const u8 = "", // jsx_factory_pragma_comment: js_ast.Span, // jsx_fragment_pragma_comment: js_ast.Span, @@ -47,6 +52,8 @@ pub const Lexer = struct { rescan_close_brace_as_template_token: bool = false, for_global_name: bool = false, prev_error_loc: i32 = -1, + allocator: *std.mem.Allocator, + fn nextCodepointSlice(it: *Lexer) callconv(.Inline) ?[]const u8 { if (it.current >= it.source.contents.len) { return null; @@ -59,23 +66,27 @@ pub const Lexer = struct { return it.source.contents[it.current - cp_len .. it.current]; } - pub fn syntax_error(self: *Lexer) void { + pub fn syntaxError(self: *Lexer) void { self.addError(self.start, "Syntax Error!!", .{}, true); } + pub fn addDefaultError(self: *Lexer, msg: []const u8) void { + self.addError(self.start, "{s}", .{msg}, true); + } + pub fn addError(self: *Lexer, _loc: usize, comptime format: []const u8, args: anytype, panic: bool) void { const loc = logger.usize2Loc(_loc); if (loc == self.prev_error_loc) { return; } - const errorMessage = std.fmt.allocPrint(self.string_literal.allocator, format, args) catch unreachable; + const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable; self.log.addError(self.source, loc, errorMessage) catch unreachable; self.prev_error_loc = loc; - if (panic) { - self.doPanic(errorMessage); - } + // if (panic) { + self.doPanic(errorMessage); + // } } pub fn addRangeError(self: *Lexer, range: logger.Range, comptime format: []const u8, args: anytype, panic: bool) void { @@ -83,7 +94,7 @@ pub const Lexer = struct { return; } - const errorMessage = std.fmt.allocPrint(self.string_literal.allocator, format, args) catch unreachable; + const errorMessage = std.fmt.allocPrint(self.allocator, format, args) catch unreachable; var msg = self.log.addRangeError(self.source, range, errorMessage); self.prev_error_loc = loc; @@ -128,6 +139,108 @@ pub const Lexer = struct { return it.source.contents[original_i..end_ix]; } + fn parseStringLiteral(lexer: *Lexer) void { + var quote: CodePoint = lexer.code_point; + var needs_slow_path = false; + var suffixLen: usize = 1; + + if (quote != '`') { + lexer.token = T.t_string_literal; + } else if (lexer.rescan_close_brace_as_template_token) { + lexer.token = T.t_template_tail; + } else { + lexer.token = T.t_no_substitution_template_literal; + } + lexer.step(); + + stringLiteral: while (true) { + switch (lexer.code_point) { + '\\' => { + needs_slow_path = true; + lexer.step(); + + // Handle Windows CRLF + if (lexer.code_point == '\r' and IS_JSON_FILE) { + lexer.step(); + if (lexer.code_point == '\n') { + lexer.step(); + } + continue :stringLiteral; + } + }, + // This indicates the end of the file + + -1 => { + lexer.addDefaultError("Unterminated string literal"); + }, + + '\r' => { + if (quote != '`') { + lexer.addDefaultError("Unterminated string literal"); + } + + // Template literals require newline normalization + needs_slow_path = true; + }, + + '\n' => { + if (quote != '`') { + lexer.addDefaultError("Unterminated string literal"); + } + }, + + '$' => { + if (quote == '`') { + lexer.step(); + if (lexer.code_point == '{') { + suffixLen = 2; + lexer.step(); + if (lexer.rescan_close_brace_as_template_token) { + lexer.token = T.t_template_middle; + } else { + lexer.token = T.t_template_head; + } + break :stringLiteral; + } + continue :stringLiteral; + } + }, + + else => { + if (quote == lexer.code_point) { + lexer.step(); + break :stringLiteral; + } + // Non-ASCII strings need the slow path + if (lexer.code_point >= 0x80) { + needs_slow_path = true; + } else if (IS_JSON_FILE and lexer.code_point < 0x20) { + lexer.syntaxError(); + } + }, + } + lexer.step(); + } + + const text = lexer.source.contents[lexer.start + 1 .. lexer.end - suffixLen]; + // TODO: actually implement proper utf16 + lexer.string_literal = lexer.allocator.alloc(u16, text.len) catch unreachable; + var i: usize = 0; + for (text) |byte| { + lexer.string_literal[i] = byte; + i += 1; + } + // for (text) + // // if (needs_slow_path) { + // // // Slow path + + // // // lexer.string_literal = lexer.(lexer.start + 1, text); + // // } else { + // // // Fast path + + // // } + } + fn step(lexer: *Lexer) void { lexer.code_point = lexer.nextCodepoint(); @@ -167,16 +280,32 @@ pub const Lexer = struct { return; } + pub fn debugInfo(self: *Lexer) void { + if (self.log.errors > 0) { + const stderr = std.io.getStdErr().writer(); + self.log.print(stderr) catch unreachable; + } else { + if (self.token == T.t_identifier or self.token == T.t_string_literal) { + std.debug.print(" {s} ", .{self.raw()}); + } else { + std.debug.print(" <{s}> ", .{tokenToString.get(self.token)}); + } + } + } + pub fn next(lexer: *Lexer) void { lexer.has_newline_before = lexer.end == 0; - while (true) { + lex: while (lexer.log.errors == 0) { + lexer.debugInfo(); + lexer.start = lexer.end; lexer.token = T.t_end_of_file; switch (lexer.code_point) { -1 => { lexer.token = T.t_end_of_file; + break :lex; }, '#' => { @@ -187,7 +316,7 @@ pub const Lexer = struct { lexer.step(); if (!isIdentifierStart(lexer.code_point)) { - lexer.syntax_error(); + lexer.syntaxError(); } lexer.step(); @@ -536,98 +665,7 @@ pub const Lexer = struct { }, '\'', '"', '`' => { - const quote = lexer.code_point; - var needsSlowPath = false; - var suffixLen = 1; - - if (quote != '`') { - lexer.token = T.t_string_literal; - } else if (lexer.rescan_close_brace_as_template_token) { - lexer.token = T.t_template_tail; - } else { - lexer.token = T.t_no_substitution_template_literal; - } - lexer.step(); - - stringLiteral: while (true) { - switch (lexer.codePoint) { - '\\' => { - needs_slow_path = true; - lexer.step(); - - // Handle Windows CRLF - if (lexer.code_point == '\r' and IS_JSON_FILE) { - lexer.step(); - if (lexer.code_point == '\n') { - lexer.step(); - } - continue; - } - }, - // This indicates the end of the file - - -1 => { - lexer.addError("Unterminated string literal"); - }, - - '\r' => { - if (quote != '`') { - lexer.addError("Unterminated string literal"); - } - - // Template literals require newline normalization - needsSlowPath = true; - }, - - '\n' => { - if (quote != '`') { - lexer.addError("Unterminated string literal"); - } - }, - - '$' => { - if (quote == '`') { - lexer.step(); - if (lexer.codePoint == '{') { - suffixLen = 2; - lexer.step(); - if (lexer.rescan_close_brace_as_template_token) { - lexer.token = T.t_template_middle; - } else { - lexer.token = T.t_template_head; - } - break stringLiteral; - } - continue; - } - }, - - quote => { - lexer.step(); - break stringLiteral; - }, - - else => { - // Non-ASCII strings need the slow path - if (lexer.codePoint >= 0x80) { - needsSlowPath = true; - } else if (IS_JSON_FILE and lexer.codePoint < 0x20) { - lexer.syntax_error(); - } - }, - } - lexer.step(); - } - - const text = lexer.source.Contents[lexer.start + 1 .. lexer.end - suffixLen]; - - if (needsSlowPath) { - // Slow path - lexer.string_literal = lexer.decodeEscapeSequences(lexer.start + 1, text); - } else { - // Fast path - - } + lexer.parseStringLiteral(); }, '_', '$', 'a'...'z', 'A'...'Z' => { @@ -636,7 +674,7 @@ pub const Lexer = struct { lexer.step(); } - if (lexer.codePoint == '\\') { + if (lexer.code_point == '\\') { lexer.scanIdentifierWithEscapes(); } else { const contents = lexer.raw(); @@ -684,6 +722,8 @@ pub const Lexer = struct { lexer.token = T.t_syntax_error; }, } + + return; } } @@ -715,17 +755,334 @@ pub const Lexer = struct { } pub fn init(log: logger.Log, source: logger.Source, allocator: *std.mem.Allocator) !Lexer { + var empty_string_literal: []u16 = undefined; var lex = Lexer{ .log = log, .source = source, - .string_literal = try std.ArrayList([]u16).initCapacity(allocator, 16), + .string_literal = empty_string_literal, .prev_error_loc = -1, + .allocator = allocator, }; lex.step(); // lex.next(); return lex; } + + fn parseNumericLiteralOrDot(lexer: *Lexer) void { + // Number or dot; + var first = lexer.code_point; + lexer.step(); + + // Dot without a digit after it; + if (first == '.' and (lexer.code_point < '0' or lexer.code_point > '9')) { + // "..." + if ((lexer.code_point == '.' and + lexer.current < lexer.source.contents.len) and + lexer.source.contents[lexer.current] == '.') + { + lexer.step(); + lexer.step(); + lexer.token = T.t_dot_dot_dot; + return; + } + + // "." + lexer.token = T.t_dot; + return; + } + + var underscoreCount: usize = 0; + var lastUnderscoreEnd: usize = 0; + var hasDotOrExponent = first == '.'; + var base: f32 = 0.0; + lexer.is_legacy_octal_literal = false; + + // Assume this is a number, but potentially change to a bigint later; + lexer.token = T.t_numeric_literal; + + // Check for binary, octal, or hexadecimal literal; + if (first == '0') { + switch (lexer.code_point) { + 'b', 'B' => { + base = 2; + }, + + 'o', 'O' => { + base = 8; + }, + + 'x', 'X' => { + base = 16; + }, + + '0'...'7', '_' => { + base = 8; + lexer.is_legacy_octal_literal = true; + }, + else => {}, + } + } + + if (base != 0) { + // Integer literal; + var isFirst = true; + var isInvalidLegacyOctalLiteral = false; + lexer.number = 0; + if (!lexer.is_legacy_octal_literal) { + lexer.step(); + } + + integerLiteral: while (true) { + switch (lexer.code_point) { + '_' => { + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } + + // The first digit must exist; + if (isFirst or lexer.is_legacy_octal_literal) { + lexer.syntaxError(); + } + + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; + }, + + '0', '1' => { + lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + }, + + '2', '3', '4', '5', '6', '7' => { + if (base == 2) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + }, + '8', '9' => { + if (lexer.is_legacy_octal_literal) { + isInvalidLegacyOctalLiteral = true; + } else if (base < 10) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point - '0'); + }, + 'A', 'B', 'C', 'D', 'E', 'F' => { + if (base != 16) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point + 10 - 'A'); + }, + + 'a', 'b', 'c', 'd', 'e', 'f' => { + if (base != 16) { + lexer.syntaxError(); + } + lexer.number = lexer.number * base + float64(lexer.code_point + 10 - 'a'); + }, + else => { + // The first digit must exist; + if (isFirst) { + lexer.syntaxError(); + } + + break :integerLiteral; + }, + } + + lexer.step(); + isFirst = false; + } + + var isBigIntegerLiteral = lexer.code_point == 'n' and !hasDotOrExponent; + + // Slow path: do we need to re-scan the input as text? + if (isBigIntegerLiteral or isInvalidLegacyOctalLiteral) { + var text = lexer.raw(); + + // Can't use a leading zero for bigint literals; + if (isBigIntegerLiteral and lexer.is_legacy_octal_literal) { + lexer.syntaxError(); + } + + // Filter out underscores; + if (underscoreCount > 0) { + var bytes = lexer.allocator.alloc(u8, text.len - underscoreCount) catch unreachable; + var i: usize = 0; + for (text) |char| { + if (char != '_') { + bytes[i] = char; + i += 1; + } + } + } + + // Store bigints as text to avoid precision loss; + if (isBigIntegerLiteral) { + lexer.identifier = text; + } else if (isInvalidLegacyOctalLiteral) { + if (std.fmt.parseFloat(f64, text)) |num| { + lexer.number = num; + } else |err| { + lexer.addError(lexer.start, "Invalid number {s}", .{text}, true); + } + } + } + } else { + // Floating-point literal; + var isInvalidLegacyOctalLiteral = first == '0' and (lexer.code_point == '8' or lexer.code_point == '9'); + + // Initial digits; + while (true) { + if (lexer.code_point < '0' or lexer.code_point > '9') { + if (lexer.code_point != '_') { + break; + } + + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } + + // The specification forbids underscores in this case; + if (isInvalidLegacyOctalLiteral) { + lexer.syntaxError(); + } + + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; + } + lexer.step(); + } + + // Fractional digits; + if (first != '.' and lexer.code_point == '.') { + // An underscore must not come last; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.end -= 1; + lexer.syntaxError(); + } + + hasDotOrExponent = true; + lexer.step(); + if (lexer.code_point == '_') { + lexer.syntaxError(); + } + while (true) { + if (lexer.code_point < '0' or lexer.code_point > '9') { + if (lexer.code_point != '_') { + break; + } + + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } + + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; + } + lexer.step(); + } + } + + // Exponent; + if (lexer.code_point == 'e' or lexer.code_point == 'E') { + // An underscore must not come last; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.end -= 1; + lexer.syntaxError(); + } + + hasDotOrExponent = true; + lexer.step(); + if (lexer.code_point == '+' or lexer.code_point == '-') { + lexer.step(); + } + if (lexer.code_point < '0' or lexer.code_point > '9') { + lexer.syntaxError(); + } + while (true) { + if (lexer.code_point < '0' or lexer.code_point > '9') { + if (lexer.code_point != '_') { + break; + } + + // Cannot have multiple underscores in a row; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.syntaxError(); + } + + lastUnderscoreEnd = lexer.end; + underscoreCount += 1; + } + lexer.step(); + } + } + + // Take a slice of the text to parse; + var text = lexer.raw(); + + // Filter out underscores; + if (underscoreCount > 0) { + var i: usize = 0; + if (lexer.allocator.alloc(u8, text.len - underscoreCount)) |bytes| { + for (text) |char| { + if (char != '_') { + bytes[i] = char; + i += 1; + } + } + text = bytes; + } else |err| { + lexer.addError(lexer.start, "Out of Memory Wah Wah Wah", .{}, true); + return; + } + } + + if (lexer.code_point == 'n' and !hasDotOrExponent) { + // The only bigint literal that can start with 0 is "0n" + if (text.len > 1 and first == '0') { + lexer.syntaxError(); + } + + // Store bigints as text to avoid precision loss; + lexer.identifier = text; + } else if (!hasDotOrExponent and lexer.end - lexer.start < 10) { + // Parse a 32-bit integer (very fast path); + var number: u32 = 0; + for (text) |c| { + number = number * 10 + @intCast(u32, c - '0'); + } + lexer.number = @intToFloat(f64, number); + } else { + // Parse a double-precision floating-point number; + if (std.fmt.parseFloat(f64, text)) |num| { + lexer.number = num; + } else |err| { + lexer.addError(lexer.start, "Invalid number", .{}, true); + } + } + } + + // An underscore must not come last; + if (lastUnderscoreEnd > 0 and lexer.end == lastUnderscoreEnd + 1) { + lexer.end -= 1; + lexer.syntaxError(); + } + + // Handle bigint literals after the underscore-at-end check above; + if (lexer.code_point == 'n' and !hasDotOrExponent) { + lexer.token = T.t_big_integer_literal; + lexer.step(); + } + + // Identifiers can't occur immediately after numbers; + if (isIdentifierStart(lexer.code_point)) { + lexer.syntaxError(); + } + } }; fn isIdentifierStart(codepoint: CodePoint) bool { @@ -792,18 +1149,43 @@ fn isWhitespace(codepoint: CodePoint) bool { } } -test "Lexer.next()" { - const msgs = std.ArrayList(logger.Msg).init(std.testing.allocator); +fn float64(num: anytype) callconv(.Inline) f64 { + return @intToFloat(f64, num); +} + +fn test_lexer(contents: []const u8) Lexer { + alloc.setup(std.heap.page_allocator) catch unreachable; + const msgs = std.ArrayList(logger.Msg).init(alloc.dynamic); const log = logger.Log{ .msgs = msgs, }; - defer std.testing.allocator.free(msgs.items); - const source = logger.Source.initPathString("index.js", "for (let i = 0; i < 100; i++) { console.log('hi'); }", std.heap.page_allocator); + const source = logger.Source.initPathString("index.js", contents, std.heap.page_allocator); + return Lexer.init(log, source, alloc.dynamic) catch unreachable; +} - var lex = try Lexer.init(log, source, std.testing.allocator); - defer lex.string_literal.shrinkAndFree(0); +// test "Lexer.next()" { +// try alloc.setup(std.heap.page_allocator); +// const msgs = std.ArrayList(logger.Msg).init(alloc.dynamic); +// const log = logger.Log{ +// .msgs = msgs, +// }; + +// const source = logger.Source.initPathString("index.js", "for (let i = 0; i < 100; i++) { console.log('hi'); }", std.heap.page_allocator); +// var lex = try Lexer.init(log, source, alloc.dynamic); +// lex.next(); +// } + +test "Lexer.next() keywords" { + var lex = test_lexer("for (let i = 0; i < 100; i++) { }"); + lex.next(); + std.testing.expectEqualStrings("\"for\"", tokenToString.get(lex.token)); + lex.next(); + std.testing.expectEqualStrings("\"(\"", tokenToString.get(lex.token)); + lex.next(); + std.testing.expectEqualStrings("\"let\"", tokenToString.get(lex.token)); lex.next(); + std.testing.expectEqualStrings("\"identifier\"", tokenToString.get(lex.token)); } test "Lexer.step()" { @@ -816,7 +1198,6 @@ test "Lexer.step()" { const source = logger.Source.initPathString("index.js", "for (let i = 0; i < 100; i++) { console.log('hi'); }", std.heap.page_allocator); var lex = try Lexer.init(log, source, std.testing.allocator); - defer lex.string_literal.shrinkAndFree(0); std.testing.expect('f' == lex.code_point); lex.step(); std.testing.expect('o' == lex.code_point); diff --git a/src/logger.zig b/src/logger.zig index 8180fd04b..a69a8dfa0 100644 --- a/src/logger.zig +++ b/src/logger.zig @@ -108,6 +108,7 @@ pub const Log = struct { } pub fn addRangeError(log: *Log, source: ?Source, r: Range, text: []u8) void { + log.errors += 1; log.addMsg(Msg{ .kind = .Error, .data = rangeData(source, r, text), @@ -115,6 +116,7 @@ pub const Log = struct { } pub fn addRangeWarning(log: *Log, source: ?Source, r: Range, text: []u8) void { + log.warnings += 1; log.addMsg(Msg{ .kind = .warning, .data = rangeData(source, r, text), @@ -129,6 +131,7 @@ pub const Log = struct { } pub fn addRangeErrorWithNotes(log: *Log, source: ?Source, r: Range, text: []u8, notes: []Data) void { + log.errors += 1; log.addMsg(Msg{ .kind = Kind.err, .data = rangeData(source, r, text), @@ -137,6 +140,7 @@ pub const Log = struct { } pub fn addRangeWarningWithNotes(log: *Log, source: ?Source, r: Range, text: []u8, notes: []Data) void { + log.warnings += 1; log.addMsg(Msg{ .kind = .warning, .data = rangeData(source, r, text), @@ -151,8 +155,8 @@ pub const Log = struct { // TODO: pub fn addError(self: *Log, _source: ?Source, loc: Loc, text: []u8) !void { - try self.addMsg(Msg{ .kind = .err, .data = rangeData(_source, Range{ .loc = loc }, text) }); self.errors += 1; + try self.addMsg(Msg{ .kind = .err, .data = rangeData(_source, Range{ .loc = loc }, text) }); } // TODO: |