aboutsummaryrefslogtreecommitdiff
path: root/src/js_lexer.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/js_lexer.zig')
-rw-r--r--src/js_lexer.zig608
1 files changed, 548 insertions, 60 deletions
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index 96b6f6835..4b78f2536 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -191,25 +191,258 @@ pub const Lexer = struct {
}
}
- fn parseStringLiteral(lexer: *LexerType) !void {
- var quote: CodePoint = lexer.code_point;
- var needs_slow_path = false;
- var suffixLen: usize = 1;
+ pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, buf: anytype) !void {
+ var iter = CodepointIterator{ .bytes = text[start..], .i = 0 };
+ const start_length = buf.items.len;
+ while (iter.nextCodepoint()) |c| {
+ const width = iter.width;
- if (quote != '`') {
- lexer.token = T.t_string_literal;
- } else if (lexer.rescan_close_brace_as_template_token) {
- lexer.token = T.t_template_tail;
- } else {
- lexer.token = T.t_no_substitution_template_literal;
+ switch (c) {
+ '\r' => {
+ // From the specification:
+ //
+ // 11.8.6.1 Static Semantics: TV and TRV
+ //
+ // TV excludes the code units of LineContinuation while TRV includes
+ // them. <CR><LF> and <CR> LineTerminatorSequences are normalized to
+ // <LF> for both TV and TRV. An explicit EscapeSequence is needed to
+ // include a <CR> or <CR><LF> sequence.
+
+ // Convert '\r\n' into '\n'
+ if (iter.i < text.len and text[iter.i] == '\n') {
+ iter.i += 1;
+ }
+
+ // Convert '\r' into '\n'
+ buf.append('\n') catch unreachable;
+ continue;
+ },
+
+ '\\' => {
+ const c2 = iter.nextCodepoint() orelse return;
+ const width2 = iter.width;
+ switch (c2) {
+ 'b' => {
+ buf.append(std.mem.readIntNative(u16, "\\b")) catch unreachable;
+ continue;
+ },
+ 'f' => {
+ buf.append(std.mem.readIntNative(u16, "\\f")) catch unreachable;
+ continue;
+ },
+ 'n' => {
+ buf.append(std.mem.readIntNative(u16, "\\n")) catch unreachable;
+ continue;
+ },
+ 'r' => {
+ buf.append(std.mem.readIntNative(u16, "\\r")) catch unreachable;
+ continue;
+ },
+ 't' => {
+ buf.append(std.mem.readIntNative(u16, "\\t")) catch unreachable;
+ continue;
+ },
+ 'v' => {
+ if (lexer.json_options != null) {
+ lexer.end = start + iter.i - width2;
+ try lexer.syntaxError();
+ }
+
+ buf.append(std.mem.readIntNative(u16, "\\v")) catch unreachable;
+ continue;
+ },
+ '0'...'7' => {
+ try lexer.addUnsupportedSyntaxError("Legacy octal literals are not supported.");
+ },
+ '8', '9' => {
+ try lexer.addUnsupportedSyntaxError("Legacy octal literals are not supported.");
+ },
+ 'x' => {
+ if (lexer.json_options != null) {
+ lexer.end = start + iter.i - width2;
+ try lexer.syntaxError();
+ }
+
+ var value: CodePoint = 0;
+ var c3: CodePoint = 0;
+ var width3: u3 = 0;
+ comptime var j: usize = 0;
+ inline while (j < 2) : (j += 1) {
+ c3 = iter.nextCodepoint() orelse return lexer.syntaxError();
+ width3 = iter.width;
+ switch (c3) {
+ '0'...'9' => {
+ value = value * 16 | (c3 - '0');
+ },
+ 'a'...'f' => {
+ value = value * 16 | (c3 + 10 - 'a');
+ },
+ 'A'...'F' => {
+ value = value * 16 | (c3 + 10 - 'A');
+ },
+ else => {
+ lexer.end = start + iter.i - width3;
+ return lexer.syntaxError();
+ },
+ }
+ }
+ iter.c = value;
+ },
+ 'u' => {
+ // We're going to make this an i64 so we don't risk integer overflows
+ // when people do weird things
+ var value: i64 = 0;
+
+ var c3 = iter.nextCodepoint() orelse return lexer.syntaxError();
+ var width3 = iter.width;
+
+ // variable-length
+ if (c3 == '{') {
+ if (lexer.json_options != null) {
+ lexer.end = start + iter.i - width2;
+ try lexer.syntaxError();
+ }
+
+ const hex_start = iter.i - width - width2 - width3;
+ var is_first = true;
+ var is_out_of_range = false;
+ variableLength: while (true) {
+ c3 = iter.nextCodepoint() orelse break :variableLength;
+
+ switch (c3) {
+ '0'...'9' => {
+ value = value * 16 | (c3 - '0');
+ },
+ 'a'...'f' => {
+ value = value * 16 | (c3 + 10 - 'a');
+ },
+ 'A'...'F' => {
+ value = value * 16 | (c3 + 10 - 'A');
+ },
+ '}' => {
+ if (is_first) {
+ lexer.end = start + iter.i - width3;
+ return lexer.syntaxError();
+ }
+ break :variableLength;
+ },
+ else => {
+ lexer.end = start + iter.i - width3;
+ return lexer.syntaxError();
+ },
+ }
+
+ // '\U0010FFFF
+ // copied from golang utf8.MaxRune
+ if (value > 1114111) {
+ is_out_of_range = true;
+ }
+ is_first = false;
+ }
+
+ if (is_out_of_range) {
+ try lexer.addRangeError(
+ .{ .loc = .{ .start = @intCast(i32, start + hex_start) }, .len = @intCast(i32, (iter.i - hex_start)) },
+ "Unicode escape sequence is out of range",
+ .{},
+ true,
+ );
+ return;
+ }
+
+ // fixed-length
+ } else {
+ // Fixed-length
+ comptime var j: usize = 0;
+ inline while (j < 4) : (j += 1) {
+ switch (c3) {
+ '0'...'9' => {
+ value = value * 16 | (c3 - '0');
+ },
+ 'a'...'f' => {
+ value = value * 16 | (c3 + 10 - 'a');
+ },
+ 'A'...'F' => {
+ value = value * 16 | (c3 + 10 - 'A');
+ },
+ else => {
+ lexer.end = start + iter.i - width3;
+ return lexer.syntaxError();
+ },
+ }
+
+ if (j < 3) {
+ c3 = iter.nextCodepoint() orelse return lexer.syntaxError();
+ width3 = iter.width;
+ }
+ }
+ }
+
+ iter.c = @truncate(CodePoint, value);
+ },
+ '\r' => {
+ if (lexer.json_options != null) {
+ lexer.end = start + iter.i - width2;
+ try lexer.syntaxError();
+ }
+
+ // Ignore line continuations. A line continuation is not an escaped newline.
+ if (iter.i < text.len and text[iter.i] == '\n') {
+ // Make sure Windows CRLF counts as a single newline
+ iter.i += 1;
+ }
+ continue;
+ },
+ '\n', 0x2028, 0x2029 => {
+ if (lexer.json_options != null) {
+ lexer.end = start + iter.i - width2;
+ try lexer.syntaxError();
+ }
+
+ // Ignore line continuations. A line continuation is not an escaped newline.
+ continue;
+ },
+ else => {
+ if (lexer.json_options != null) {
+ switch (c2) {
+ '"', '\\', '/' => {},
+ else => {
+ lexer.end = start + iter.i - width2;
+ try lexer.syntaxError();
+ },
+ }
+ }
+ iter.c = c2;
+ },
+ }
+ },
+ else => {},
+ }
+
+ if (iter.c <= 0xFFFF) {
+ buf.append(@intCast(u16, iter.c)) catch unreachable;
+ } else {
+ iter.c -= 0x10000;
+ buf.ensureUnusedCapacity(2) catch unreachable;
+ buf.appendAssumeCapacity(@intCast(u16, 0xD800 + ((iter.c >> 10) & 0x3FF)));
+ buf.appendAssumeCapacity(@intCast(u16, 0xDC00 + (iter.c & 0x3FF)));
+ }
}
- try lexer.step();
+ }
+ pub const InnerStringLiteral = packed struct { suffix_len: u3, needs_slow_path: bool };
+ fn parseStringLiteralInnter(lexer: *LexerType, comptime quote: CodePoint) !InnerStringLiteral {
+ var needs_slow_path = false;
+ var suffix_len: u3 = 1;
stringLiteral: while (true) {
switch (lexer.code_point) {
'\\' => {
- needs_slow_path = true;
try lexer.step();
+ // Skip slow path for \n in a string literal
+ // This is pretty common, shows up in e.g. React
+ // Example code: array.split("\n")
+ // We don't need to decode as UTF16 for that. We know it's just a newline char.
+ needs_slow_path = lexer.code_point != 'n';
// Handle Windows CRLF
if (lexer.code_point == '\r' and lexer.json_options != null) {
@@ -245,7 +478,7 @@ pub const Lexer = struct {
if (quote == '`') {
try lexer.step();
if (lexer.code_point == '{') {
- suffixLen = 2;
+ suffix_len = 2;
try lexer.step();
if (lexer.rescan_close_brace_as_template_token) {
lexer.token = T.t_template_middle;
@@ -257,12 +490,15 @@ pub const Lexer = struct {
continue :stringLiteral;
}
},
+ // exit condition
+ quote => {
+ try lexer.step();
+
+ break;
+ },
else => {
- if (quote == lexer.code_point) {
- try lexer.step();
- break :stringLiteral;
- }
+
// Non-ASCII strings need the slow path
if (lexer.code_point >= 0x80) {
needs_slow_path = true;
@@ -274,19 +510,41 @@ pub const Lexer = struct {
try lexer.step();
}
+ return InnerStringLiteral{ .needs_slow_path = needs_slow_path, .suffix_len = suffix_len };
+ }
+
+ fn parseStringLiteral(lexer: *LexerType) !void {
+ var quote: CodePoint = lexer.code_point;
+
+ if (quote != '`') {
+ lexer.token = T.t_string_literal;
+ } else if (lexer.rescan_close_brace_as_template_token) {
+ lexer.token = T.t_template_tail;
+ } else {
+ lexer.token = T.t_no_substitution_template_literal;
+ }
+ try lexer.step();
+
+ var string_literal_details = switch (quote) {
+ '`' => try lexer.parseStringLiteralInnter('`'),
+ '\'' => try lexer.parseStringLiteralInnter('\''),
+ '"' => try lexer.parseStringLiteralInnter('"'),
+ else => unreachable,
+ };
+
// Reset string literal
- lexer.string_literal_slice = lexer.source.contents[lexer.start + 1 .. lexer.end - suffixLen];
- lexer.string_literal_is_ascii = !needs_slow_path;
+ lexer.string_literal_slice = lexer.source.contents[lexer.start + 1 .. lexer.end - string_literal_details.suffix_len];
+ lexer.string_literal_is_ascii = !string_literal_details.needs_slow_path;
lexer.string_literal_buffer.shrinkRetainingCapacity(0);
- if (needs_slow_path) {
- lexer.string_literal_buffer.ensureTotalCapacity(lexer.string_literal_slice.len) catch unreachable;
- var slice = lexer.string_literal_buffer.allocatedSlice();
- lexer.string_literal_buffer.items = slice[0..strings.toUTF16Buf(lexer.string_literal_slice, slice)];
+ if (string_literal_details.needs_slow_path) {
+ lexer.string_literal_buffer.ensureUnusedCapacity(lexer.string_literal_slice.len) catch unreachable;
+ try lexer.decodeEscapeSequences(0, lexer.string_literal_slice, &lexer.string_literal_buffer);
}
if (quote == '\'' and lexer.json_options != null) {
try lexer.addRangeError(lexer.range(), "JSON strings must use double quotes", .{}, true);
}
+
// for (text)
// // if (needs_slow_path) {
// // // Slow path
@@ -333,8 +591,131 @@ pub const Lexer = struct {
return Error.SyntaxError;
}
- pub fn scanIdentifierWithEscapes(self: *LexerType) !void {
- try self.addUnsupportedSyntaxError("escape sequence");
+ pub const IdentifierKind = enum { normal, private };
+ pub const ScanResult = struct { token: T, contents: string };
+ threadlocal var small_escape_sequence_buffer: [4096]u16 = undefined;
+ const FakeArrayList16 = struct {
+ items: []u16,
+ i: usize = 0,
+
+ pub fn append(fake: *FakeArrayList16, value: u16) !void {
+ std.debug.assert(fake.i < fake.items.len);
+ fake.items[fake.i] = value;
+ fake.i += 1;
+ }
+
+ pub fn appendAssumeCapacity(fake: *FakeArrayList16, value: u16) void {
+ std.debug.assert(fake.i < fake.items.len);
+ fake.items[fake.i] = value;
+ fake.i += 1;
+ }
+ pub fn ensureUnusedCapacity(fake: *FakeArrayList16, int: anytype) !void {
+ std.debug.assert(fake.i + int <= fake.items.len);
+ }
+ };
+ threadlocal var large_escape_sequence_list: std.ArrayList(u16) = undefined;
+ threadlocal var large_escape_sequence_list_loaded: bool = false;
+
+ // This is an edge case that doesn't really exist in the wild, so it doesn't
+ // need to be as fast as possible.
+ pub fn scanIdentifierWithEscapes(lexer: *LexerType, comptime kind: IdentifierKind) !ScanResult {
+ var result = ScanResult{ .token = .t_end_of_file, .contents = "" };
+ // First pass: scan over the identifier to see how long it is
+ while (true) {
+ // Scan a unicode escape sequence. There is at least one because that's
+ // what caused us to get on this slow path in the first place.
+ if (lexer.code_point == '\\') {
+ try lexer.step();
+ if (lexer.code_point != 'u') {
+ try lexer.syntaxError();
+ }
+ try lexer.step();
+ if (lexer.code_point == '{') {
+ // Variable-length
+ try lexer.step();
+ while (lexer.code_point != '}') {
+ switch (lexer.code_point) {
+ '0'...'9', 'a'...'f', 'A'...'F' => {
+ try lexer.step();
+ },
+ else => {
+ try lexer.syntaxError();
+ },
+ }
+ }
+
+ try lexer.step();
+ } else {
+ // Fixed-length
+ comptime var j: usize = 0;
+ inline while (j < 4) : (j += 1) {
+ switch (lexer.code_point) {
+ '0'...'9', 'a'...'f', 'A'...'F' => {
+ try lexer.step();
+ },
+ else => {
+ try lexer.syntaxError();
+ },
+ }
+ }
+ }
+ continue;
+ }
+
+ if (!isIdentifierContinue(lexer.code_point)) {
+ break;
+ }
+ try lexer.step();
+ }
+
+ // Second pass: re-use our existing escape sequence parser
+ var original_text = lexer.raw();
+ if (original_text.len < 1024) {
+ var buf = FakeArrayList16{ .items = &small_escape_sequence_buffer, .i = 0 };
+ try lexer.decodeEscapeSequences(lexer.start, original_text, &buf);
+ result.contents = lexer.utf16ToString(buf.items[0..buf.i]);
+ } else {
+ if (!large_escape_sequence_list_loaded) {
+ large_escape_sequence_list = try std.ArrayList(u16).initCapacity(lexer.allocator, original_text.len);
+ large_escape_sequence_list_loaded = true;
+ }
+
+ large_escape_sequence_list.shrinkRetainingCapacity(0);
+ try lexer.decodeEscapeSequences(lexer.start, original_text, &large_escape_sequence_list);
+ result.contents = lexer.utf16ToString(large_escape_sequence_list.items);
+ }
+
+ var identifier = result.contents;
+ if (kind == .private) {
+ identifier = result.contents[1..];
+ }
+
+ if (!isIdentifier(identifier)) {
+ try lexer.addRangeError(
+ .{ .loc = logger.usize2Loc(lexer.start), .len = @intCast(i32, lexer.end - lexer.start) },
+ "Invalid identifier: \"{s}\"",
+ .{result.contents},
+ true,
+ );
+ }
+ result.contents = identifier;
+
+ // Escaped keywords are not allowed to work as actual keywords, but they are
+ // allowed wherever we allow identifiers or keywords. For example:
+ //
+ // // This is an error (equivalent to "var var;")
+ // var \u0076\u0061\u0072;
+ //
+ // // This is an error (equivalent to "var foo;" except for this rule)
+ // \u0076\u0061\u0072 foo;
+ //
+ // // This is fine (equivalent to "foo.var;")
+ // foo.\u0076\u0061\u0072;
+ //
+ result.token = if (Keywords.has(result.contents)) .t_escaped_keyword else .t_identifier;
+
+ // const text = lexer.decodeEscapeSequences(lexer.start, lexer.raw(), )
+ return result;
}
pub fn debugInfo(self: *LexerType) void {
@@ -462,31 +843,46 @@ pub const Lexer = struct {
'#' => {
if (lexer.start == 0 and lexer.source.contents[1] == '!') {
- try lexer.addUnsupportedSyntaxError("#!hashbang is not supported yet.");
- return;
- }
-
- try lexer.step();
- if (!isIdentifierStart(lexer.code_point)) {
- try lexer.syntaxError();
- }
- try lexer.step();
-
- if (isIdentifierStart(lexer.code_point)) {
- try lexer.step();
- while (isIdentifierContinue(lexer.code_point)) {
+ // "#!/usr/bin/env node"
+ lexer.token = .t_hashbang;
+ hashbang: while (true) {
try lexer.step();
+ switch (lexer.code_point) {
+ '\r', '\n', 0x2028, 0x2029 => {
+ break :hashbang;
+ },
+ -1 => {
+ break :hashbang;
+ },
+ else => {},
+ }
}
+ lexer.identifier = lexer.raw();
+ } else {
+ try lexer.step();
if (lexer.code_point == '\\') {
- try lexer.scanIdentifierWithEscapes();
+ const scan_result = try lexer.scanIdentifierWithEscapes(.private);
+ lexer.identifier = scan_result.contents;
lexer.token = T.t_private_identifier;
-
- // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
} else {
- lexer.token = T.t_private_identifier;
- lexer.identifier = lexer.raw();
+ if (!isIdentifierStart(lexer.code_point)) {
+ try lexer.syntaxError();
+ }
+
+ try lexer.step();
+ while (isIdentifierContinue(lexer.code_point)) {
+ try lexer.step();
+ }
+ if (lexer.code_point == '\\') {
+ const scan_result = try lexer.scanIdentifierWithEscapes(.private);
+ lexer.identifier = scan_result.contents;
+ lexer.token = T.t_private_identifier;
+ } else {
+ lexer.token = T.t_private_identifier;
+ lexer.identifier = lexer.raw();
+ }
+ break;
}
- break;
}
},
'\r', '\n', 0x2028, 0x2029 => {
@@ -966,7 +1362,9 @@ pub const Lexer = struct {
}
if (lexer.code_point == '\\') {
- try lexer.scanIdentifierWithEscapes();
+ const scan_result = try lexer.scanIdentifierWithEscapes(.normal);
+ lexer.identifier = scan_result.contents;
+ lexer.token = scan_result.token;
} else {
const contents = lexer.raw();
lexer.identifier = contents;
@@ -975,8 +1373,9 @@ pub const Lexer = struct {
},
'\\' => {
- // TODO: normal
- try lexer.scanIdentifierWithEscapes();
+ const scan_result = try lexer.scanIdentifierWithEscapes(.normal);
+ lexer.identifier = scan_result.contents;
+ lexer.token = scan_result.token;
},
'.', '0'...'9' => {
@@ -996,8 +1395,9 @@ pub const Lexer = struct {
try lexer.step();
}
if (lexer.code_point == '\\') {
-
- // lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier);
+ const scan_result = try lexer.scanIdentifierWithEscapes(.normal);
+ lexer.identifier = scan_result.contents;
+ lexer.token = scan_result.token;
} else {
lexer.token = T.t_identifier;
lexer.identifier = lexer.raw();
@@ -2143,26 +2543,114 @@ pub fn isIdentifierUTF16(text: JavascriptString) bool {
return true;
}
+pub const CodepointIterator = struct {
+ bytes: []const u8,
+ i: usize,
+ width: u3 = 0,
+ c: CodePoint = 0,
+
+ pub fn nextCodepointSlice(it: *CodepointIterator) ?[]const u8 {
+ if (it.i >= it.bytes.len) {
+ return null;
+ }
+
+ const cp_len = std
+ .unicode.utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;
+ it.i += cp_len;
+ return it.bytes[it.i - cp_len .. it.i];
+ }
+
+ pub fn nextCodepoint(it: *CodepointIterator) ?CodePoint {
+ const slice = it.nextCodepointSlice() orelse return null;
+ it.width = @intCast(u3, slice.len);
+
+ it.c = switch (it.width) {
+ 1 => @intCast(CodePoint, slice[0]),
+ 2 => @intCast(CodePoint, std.unicode.utf8Decode2(slice) catch unreachable),
+ 3 => @intCast(CodePoint, std.unicode.utf8Decode3(slice) catch unreachable),
+ 4 => @intCast(CodePoint, std.unicode.utf8Decode4(slice) catch unreachable),
+ else => unreachable,
+ };
+
+ return it.c;
+ }
+
+ /// Look ahead at the next n codepoints without advancing the iterator.
+ /// If fewer than n codepoints are available, then return the remainder of the string.
+ pub fn peek(it: *CodepointIterator, n: usize) []const u8 {
+ const original_i = it.i;
+ defer it.i = original_i;
+
+ var end_ix = original_i;
+ var found: usize = 0;
+ while (found < n) : (found += 1) {
+ const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
+ end_ix += next_codepoint.len;
+ }
+
+ return it.bytes[original_i..end_ix];
+ }
+};
+
// TODO: implement this to actually work right
// this fn is a stub!
pub fn rangeOfIdentifier(source: *const Source, loc: logger.Loc) logger.Range {
+ const text = source.contents[loc.toUsize()..];
var r = logger.Range{ .loc = loc, .len = 0 };
- const offset = @intCast(usize, loc.start);
- var i: usize = 0;
- for (source.contents[offset..]) |c| {
- if (isIdentifierStart(@as(CodePoint, c))) {
- for (source.contents[offset + i ..]) |c_| {
- if (!isIdentifierContinue(c_)) {
- r.len = std.math.lossyCast(i32, i);
- return r;
+ if (text.len == 0) {
+ return r;
+ }
+
+ var iter = CodepointIterator{ .bytes = text, .i = 0 };
+ var c = @intCast(CodePoint, iter.nextCodepoint() orelse unreachable);
+
+ // Handle private names
+ if (c == '#') {
+ c = @intCast(CodePoint, iter.nextCodepoint() orelse {
+ r.len = 1;
+ return r;
+ });
+ }
+
+ if (isIdentifierStart(c) or c == '\\') {
+ defer r.len = @intCast(i32, iter.i);
+ while (iter.nextCodepoint()) |code_point| {
+ if (code_point == '\\') {
+ // Search for the end of the identifier
+
+ // Skip over bracketed unicode escapes such as "\u{10000}"
+ if (iter.i + 2 < text.len and text[iter.i + 1] == 'u' and text[iter.i + 2] == '{') {
+ iter.i += 2;
+ while (iter.i < text.len) {
+ if (text[iter.i] == '}') {
+ iter.i += 1;
+ break;
+ }
+ iter.i += 1;
+ }
}
- i += 1;
+ } else if (!isIdentifierContinue(code_point)) {
+ return r;
}
}
-
- i += 1;
}
+ // const offset = @intCast(usize, loc.start);
+ // var i: usize = 0;
+ // for (text) |c| {
+ // if (isIdentifierStart(@as(CodePoint, c))) {
+ // for (source.contents[offset + i ..]) |c_| {
+ // if (!isIdentifierContinue(c_)) {
+ // r.len = std.math.lossyCast(i32, i);
+ // return r;
+ // }
+ // i += 1;
+ // }
+ // }
+
+ // i += 1;
+ // }
+
return r;
}