wip

author: Jarred Sumner <jarred@jarredsumner.com> 2021-04-18 20:43:04 -0700
committer: Jarred Sumner <jarred@jarredsumner.com> 2021-04-18 20:43:04 -0700
commit: 97c6c4b036d61ad5a42dff52340a6eb44dd8c855 (patch)
tree: de4fdd31827f9423db2566c9b62792ab9cb2d2c6 /src/js_lexer.zig
parent: 41367499de215d8125310118ef82737c9aca8a57 (diff)
download: bun-97c6c4b036d61ad5a42dff52340a6eb44dd8c855.tar.gz
bun-97c6c4b036d61ad5a42dff52340a6eb44dd8c855.tar.zst
bun-97c6c4b036d61ad5a42dff52340a6eb44dd8c855.zip
1 files changed, 328 insertions, 15 deletions
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index 41c5a084e..4901d8153 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -10,7 +10,15 @@ pub const Keywords = tables.Keywords;
 pub const tokenToString = tables.tokenToString;
 pub const jsxEntity = tables.jsxEntity;
 
+const string = []const u8;
+
 pub const Lexer = struct {
+    // pub const Error = error{
+    //     UnexpectedToken,
+    //     EndOfFile,
+    // };
+
+    // err: ?Lexer.Error,
     log: logger.Log,
     source: logger.Source,
     current: usize = 0,
@@ -26,9 +34,9 @@ pub const Lexer = struct {
     is_legacy_octal_literal: bool = false,
     // comments_to_preserve_before: []js_ast.Comment,
     // all_original_comments: []js_ast.Comment,
-    code_point: CodePoint = 0,
-    string_literal: []u16,
-    identifier: []u8 = "",
+    code_point: CodePoint = -1,
+    string_literal: std.ArrayList([]u16),
+    identifier: []const u8 = "",
     // jsx_factory_pragma_comment: js_ast.Span,
     // jsx_fragment_pragma_comment: js_ast.Span,
     // source_mapping_url: js_ast.Span,
@@ -48,12 +56,41 @@ pub const Lexer = struct {
         return it.source.contents[it.current - cp_len .. it.current];
     }
 
-    pub fn addError(self: *Lexer, loc: logger.Loc, text: []u8) void {
-        if (loc == self.prevErrorLoc) {
+    pub fn syntax_error(self: *Lexer) void {
+        self.addError(self.start, "Syntax Error!!", .{}, true);
+    }
+
+    pub fn addError(self: *Lexer, _loc: usize, comptime format: []const u8, args: anytype, panic: bool) void {
+        const loc = logger.usize2Loc(_loc);
+        if (loc == self.prev_error_loc) {
+            return;
+        }
+
+        const errorMessage = std.fmt.allocPrint(self.string_literal.allocator, format, args) catch unreachable;
+        self.log.addError(self.source, loc, errorMessage) catch unreachable;
+        self.prev_error_loc = loc;
+
+        if (panic) {
+            self.doPanic(errorMessage);
+        }
+    }
+
+    pub fn addRangeError(self: *Lexer, range: logger.Range, comptime format: []const u8, args: anytype, panic: bool) void {
+        if (loc == self.prev_error_loc) {
             return;
         }
 
+        const errorMessage = std.fmt.allocPrint(self.string_literal.allocator, format, args) catch unreachable;
+        var msg = self.log.addRangeError(self.source, range, errorMessage);
         self.prev_error_loc = loc;
+
+        if (panic) {
+            self.doPanic(errorMessage);
+        }
+    }
+
+    fn doPanic(self: *Lexer, content: []const u8) void {
+        std.debug.panic("{s}", .{content});
     }
 
     pub fn codePointEql(self: *Lexer, a: u8) bool {
@@ -61,7 +98,7 @@ pub const Lexer = struct {
     }
 
     fn nextCodepoint(it: *Lexer) callconv(.Inline) CodePoint {
-        const slice = it.nextCodepointSlice() orelse return @as(CodePoint, 0);
+        const slice = it.nextCodepointSlice() orelse return @as(CodePoint, -1);
 
         switch (slice.len) {
             1 => return @as(CodePoint, slice[0]),
@@ -118,39 +155,315 @@ pub const Lexer = struct {
         }
     }
 
-    pub fn next(self: *Lexer) void {}
+    pub fn addUnsupportedSyntaxError(self: *Lexer, msg: []const u8) void {
+        self.addError(self.end, "Unsupported syntax: {s}", .{msg}, true);
+    }
+
+    pub fn scanIdentifierWithEscapes(self: *Lexer) void {
+        self.addUnsupportedSyntaxError("escape sequence");
+        return;
+    }
+
+    pub fn next(lexer: *Lexer) void {
+        lexer.has_newline_before = lexer.end == 0;
+
+        while (true) {
+            lexer.start = lexer.end;
+            lexer.token = T.t_end_of_file;
+
+            switch (lexer.code_point) {
+                -1 => {
+                    lexer.token = T.t_end_of_file;
+                },
+
+                '#' => {
+                    if (lexer.start == 0 and lexer.source.contents[1] == '!') {
+                        lexer.addUnsupportedSyntaxError("#!hashbang is not supported yet.");
+                        return;
+                    }
+
+                    lexer.step();
+                    if (!isIdentifierStart(lexer.code_point)) {
+                        lexer.syntax_error();
+                    }
+                    lexer.step();
+
+                    if (isIdentifierStart(lexer.code_point)) {
+                        lexer.step();
+                        while (isIdentifierContinue(lexer.code_point)) {
+                            lexer.step();
+                        }
+                        if (lexer.code_point == '\\') {
+                            lexer.scanIdentifierWithEscapes();
+                            lexer.token = T.t_private_identifier;
+                            // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
+                        } else {
+                            lexer.token = T.t_private_identifier;
+                            lexer.identifier = lexer.raw();
+                        }
+                        break;
+                    }
+                },
+                '\r', '\n', 0x2028, 0x2029 => {
+                    lexer.step();
+                    lexer.has_newline_before = true;
+                    continue;
+                },
+
+                '\t', ' ' => {
+                    lexer.step();
+                    continue;
+                },
+
+                '(' => {
+                    lexer.step();
+                    lexer.token = T.t_open_paren;
+                },
+                ')' => {
+                    lexer.step();
+                    lexer.token = T.t_close_paren;
+                },
+                '[' => {
+                    lexer.step();
+                    lexer.token = T.t_open_bracket;
+                },
+                ']' => {
+                    lexer.step();
+                    lexer.token = T.t_close_bracket;
+                },
+                '{' => {
+                    lexer.step();
+                    lexer.token = T.t_open_brace;
+                },
+                '}' => {
+                    lexer.step();
+                    lexer.token = T.t_close_brace;
+                },
+                ',' => {
+                    lexer.step();
+                    lexer.token = T.t_comma;
+                },
+                ':' => {
+                    lexer.step();
+                    lexer.token = T.t_colon;
+                },
+                ';' => {
+                    lexer.step();
+                    lexer.token = T.t_semicolon;
+                },
+                '@' => {
+                    lexer.step();
+                    lexer.token = T.t_at;
+                },
+                '~' => {
+                    lexer.step();
+                    lexer.token = T.t_tilde;
+                },
+
+                '?' => {
+                    // '?' or '?.' or '??' or '??='
+                    lexer.step();
+                    switch (lexer.code_point) {
+                        '?' => {
+                            lexer.step();
+                            switch (lexer.code_point) {
+                                '=' => {
+                                    lexer.step();
+                                    lexer.token = T.t_question_question_equals;
+                                },
+                                else => {
+                                    lexer.token = T.t_question_question;
+                                },
+                            }
+                        },
+
+                        '.' => {
+                            lexer.token = T.t_question;
+                            const current = lexer.current;
+                            const contents = lexer.source.contents;
+
+                            // Lookahead to disambiguate with 'a?.1:b'
+                            if (current < contents.len) {
+                                const c = contents[current];
+                                if (c < '0' or c > '9') {
+                                    lexer.step();
+                                    lexer.token = T.t_question_dot;
+                                }
+                            }
+                        },
+                        else => {
+                            lexer.token = T.t_question;
+                        },
+                    }
+                },
+
+                '%' => {
+                    // '%' or '%='
+                    lexer.step();
+                    switch (lexer.code_point) {
+                        '=' => {
+                            lexer.step();
+                            lexer.token = T.t_percent_equals;
+                        },
+
+                        else => {
+                            lexer.token = T.t_percent;
+                        },
+                    }
+                },
 
-    pub fn init(log: logger.Log, source: logger.Source) Lexer {
-        var string_literal = [1]u16{0};
+                else => {
+                    // Check for unusual whitespace characters
+                    if (isWhitespace(lexer.code_point)) {
+                        lexer.step();
+                        continue;
+                    }
 
+                    if (isIdentifierStart(lexer.code_point)) {
+                        lexer.step();
+                        while (isIdentifierContinue(lexer.code_point)) {
+                            lexer.step();
+                        }
+                        if (lexer.code_point == '\\') {
+
+                            // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
+                        } else {
+                            lexer.token = T.t_identifier;
+                            lexer.identifier = lexer.raw();
+                        }
+                        break;
+                    }
+
+                    lexer.end = lexer.current;
+                    lexer.token = T.t_syntax_error;
+                },
+            }
+        }
+    }
+
+    pub fn expected(self: *Lexer, token: T) void {
+        if (tokenToString.has(text)) {
+            self.expectedString(text);
+        } else {
+            self.unexpected();
+        }
+    }
+
+    pub fn raw(self: *Lexer) []const u8 {
+        return self.source.contents[self.start..self.end];
+    }
+
+    pub fn expectedString(self: *Lexer, text: string) void {
+        var found = text;
+        if (self.source.contents.len == self.start) {
+            found = "end of file";
+        }
+        self.addRangeError(self.range(), "Expected %s but found %s", .{ text, found }, true);
+    }
+
+    pub fn range(self: *Lexer) logger.Range {
+        return logger.Range{
+            .start = self.start,
+            .len = self.end - self.start,
+        };
+    }
+
+    pub fn init(log: logger.Log, source: logger.Source, allocator: *std.mem.Allocator) !Lexer {
         var lex = Lexer{
             .log = log,
             .source = source,
-            .string_literal = &string_literal,
+            .string_literal = try std.ArrayList([]u16).initCapacity(allocator, 16),
             .prev_error_loc = -1,
         };
         lex.step();
-        lex.next();
+        // lex.next();
 
         return lex;
     }
 };
 
+fn isIdentifierStart(codepoint: CodePoint) bool {
+    switch (codepoint) {
+        'a'...'z', 'A'...'Z', '_', '$' => {
+            return true;
+        },
+        else => {
+            return false;
+        },
+    }
+}
+fn isIdentifierContinue(codepoint: CodePoint) bool {
+    switch (codepoint) {
+        '_', '$', '0'...'9', 'a'...'z', 'A'...'Z' => {
+            return true;
+        },
+        else => {},
+    }
+
+    // All ASCII identifier start code points are listed above
+    if (codepoint < 0x7F) {
+        return false;
+    }
+
+    // ZWNJ and ZWJ are allowed in identifiers
+    if (codepoint == 0x200C or codepoint == 0x200D) {
+        return true;
+    }
+
+    return false;
+}
+
+fn isWhitespace(codepoint: CodePoint) bool {
+    switch (codepoint) {
+        0x000B, // line tabulation
+        0x0009, // character tabulation
+        0x000C, // form feed
+        0x0020, // space
+        0x00A0, // no-break space
+        // Unicode "Space_Separator" code points
+        0x1680, // ogham space mark
+        0x2000, // en quad
+        0x2001, // em quad
+        0x2002, // en space
+        0x2003, // em space
+        0x2004, // three-per-em space
+        0x2005, // four-per-em space
+        0x2006, // six-per-em space
+        0x2007, // figure space
+        0x2008, // punctuation space
+        0x2009, // thin space
+        0x200A, // hair space
+        0x202F, // narrow no-break space
+        0x205F, // medium mathematical space
+        0x3000, // ideographic space
+        0xFEFF,
+        => {
+            return true;
+        }, // zero width non-breaking space
+        else => {
+            return false;
+        },
+    }
+}
+
 test "Lexer.step()" {
     const msgs = std.ArrayList(logger.Msg).init(std.testing.allocator);
     const log = logger.Log{
         .msgs = msgs,
     };
 
-    var sourcefile = "for (let i = 0; i < 100; i++) { console.log('hi'); }".*;
-    var identifier_name = "loop".*;
     defer std.testing.allocator.free(msgs.items);
-    const source = logger.Source{ .index = 0, .contents = &sourcefile, .identifier_name = &identifier_name };
+    const source = logger.Source.initPathString("index.js", "for (let i = 0; i < 100; i++) { console.log('hi'); }", std.heap.page_allocator);
 
-    var lex = Lexer.init(log, source);
+    var lex = try Lexer.init(log, source, std.testing.allocator);
+    defer lex.string_literal.shrinkAndFree(0);
     std.testing.expect('f' == lex.code_point);
     lex.step();
     std.testing.expect('o' == lex.code_point);
     lex.step();
     std.testing.expect('r' == lex.code_point);
+    while (lex.current < source.contents.len) {
+        std.testing.expect(lex.code_point == source.contents[lex.current - 1]);
+        lex.step();
+    }
 }
author	Jarred Sumner <jarred@jarredsumner.com>	2021-04-18 20:43:04 -0700
committer	Jarred Sumner <jarred@jarredsumner.com>	2021-04-18 20:43:04 -0700
commit	97c6c4b036d61ad5a42dff52340a6eb44dd8c855 (patch)
tree	de4fdd31827f9423db2566c9b62792ab9cb2d2c6 /src/js_lexer.zig
parent	41367499de215d8125310118ef82737c9aca8a57 (diff)
download	bun-97c6c4b036d61ad5a42dff52340a6eb44dd8c855.tar.gz bun-97c6c4b036d61ad5a42dff52340a6eb44dd8c855.tar.zst bun-97c6c4b036d61ad5a42dff52340a6eb44dd8c855.zip