Diffstat (limited to 'src/js_lexer.zig')
-rw-r--r--  src/js_lexer.zig  152
1 file changed, 37 insertions(+), 115 deletions(-)
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index a966358b8..f5fe8cca4 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -212,14 +212,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier);
}
- pub inline fn stringLiteralUTF16(lexer: *LexerType) JavascriptString {
- if (lexer.string_literal_is_ascii) {
- return lexer.stringToUTF16(lexer.string_literal_slice);
- } else {
- return lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable;
- }
- }
-
pub fn deinit(this: *LexerType) void {}
fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
@@ -227,7 +219,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
defer buf_.* = buf;
if (comptime is_json) lexer.is_ascii_only = false;
- var iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 };
+ const iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 };
var iter = strings.CodepointIterator.Cursor{};
const start_length = buf.items.len;
while (iterator.next(&iter)) {
@@ -1747,7 +1739,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
if (lexer.string_literal_is_ascii) {
return js_ast.E.String{ .utf8 = lexer.string_literal_slice };
} else {
- return js_ast.E.String{ .value = lexer.stringLiteralUTF16() };
+ return js_ast.E.String{ .value = lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable };
}
}
@@ -1791,16 +1783,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
}
// TODO: use wtf-8 encoding.
- pub fn stringToUTF16(lexer: *LexerType, str: string) JavascriptString {
- var buf: JavascriptString = lexer.allocator.alloc(u16, std.mem.len(str)) catch unreachable;
- // theres prob a faster/better way
- for (str) |char, i| {
- buf[i] = char;
- }
- return buf;
- }
-
- // TODO: use wtf-8 encoding.
pub fn utf16ToStringWithValidation(lexer: *LexerType, js: JavascriptString) !string {
// return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js);
return utf16ToString(lexer, js);
@@ -2112,25 +2094,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
var decoded = jsx_decode_buf;
defer jsx_decode_buf = decoded;
var decoded_ptr = &decoded;
- var i: u32 = 0;
+
var after_last_non_whitespace: ?u32 = null;
// Trim whitespace off the end of the first line
var first_non_whitespace: ?u32 = 0;
- while (i < text.len) {
- const width: u3 = strings.utf8ByteSequenceLength(text[i]);
-
- const c: CodePoint = switch (width) {
- 0 => -1,
- 1 => @intCast(CodePoint, text[i]),
- 2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable),
- 3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable),
- 4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable),
- else => unreachable,
- };
+ const iterator = strings.CodepointIterator.init(text);
+ var cursor = strings.CodepointIterator.Cursor{};
- switch (c) {
+ while (iterator.next(&cursor)) {
+ switch (cursor.c) {
'\r', '\n', 0x2028, 0x2029 => {
if (first_non_whitespace != null and after_last_non_whitespace != null) {
// Newline
@@ -2148,15 +2122,14 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
'\t', ' ' => {},
else => {
// Check for unusual whitespace characters
- if (!isWhitespace(@intCast(CodePoint, c))) {
- after_last_non_whitespace = i + width;
+ if (!isWhitespace(cursor.c)) {
+ after_last_non_whitespace = cursor.i + @as(u32, cursor.width);
if (first_non_whitespace == null) {
- first_non_whitespace = i;
+ first_non_whitespace = cursor.i;
}
}
},
}
- i += width;
}
if (first_non_whitespace) |start| {
@@ -2171,25 +2144,13 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
}
pub fn decodeJSXEntities(lexer: *LexerType, text: string, out: *std.ArrayList(u16)) !void {
- var i: usize = 0;
- var buf = [4]u8{ 0, 0, 0, 0 };
-
- while (i < text.len) {
- const width: u3 = strings.utf8ByteSequenceLength(text[i]);
-
- var c: CodePoint = switch (width) {
- 0 => -1,
- 1 => @intCast(CodePoint, text[i]),
- 2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable),
- 3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable),
- 4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable),
- else => unreachable,
- };
- i += width;
-
- if (c == '&') {
- if (strings.indexOfChar(text[i..text.len], ';')) |length| {
- const entity = text[i .. i + length];
+ const iterator = strings.CodepointIterator.init(text);
+ var cursor = strings.CodepointIterator.Cursor{};
+
+ while (iterator.next(&cursor)) {
+ if (cursor.c == '&') {
+ if (strings.indexOfChar(text[cursor.i..], ';')) |length| {
+ const entity = text[cursor.i .. @as(usize, cursor.i) + length];
if (entity[0] == '#') {
var number = entity[1..entity.len];
var base: u8 = 10;
@@ -2197,22 +2158,32 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
number = number[1..number.len];
base = 16;
}
- c = try std.fmt.parseInt(i32, number, base);
- i += length + 1;
+ cursor.c = try std.fmt.parseInt(i32, number, base);
+ cursor.i += @intCast(u32, length) + 1;
+ cursor.width = 0;
} else if (tables.jsxEntity.get(entity)) |ent| {
- c = ent;
- i += length + 1;
+ cursor.c = ent;
+ cursor.i += @intCast(u32, length) + 1;
}
}
}
- if (c <= 0xFFFF) {
- try out.append(@intCast(u16, c));
+ if (cursor.c <= 0xFFFF) {
+ try out.append(@intCast(u16, cursor.c));
} else {
- c -= 0x1000;
+ cursor.c -= 0x10000;
try out.ensureUnusedCapacity(2);
- out.appendAssumeCapacity(@intCast(u16, 0xD800 + ((c >> 10) & 0x3FF)));
- out.appendAssumeCapacity(@intCast(u16, 0xDC00 + (c & 0x3FF)));
+ (out.items.ptr + out.items.len)[0..2].* = [_]u16{
+ @truncate(
+ u16,
+ @bitCast(u32, @as(i32, 0xD800) + ((cursor.c >> 10) & 0x3FF)),
+ ),
+ @truncate(
+ u16,
+ @bitCast(u32, @as(i32, 0xDC00) + (cursor.c & 0x3FF)),
+ ),
+ };
+ out.items = out.items.ptr[0 .. out.items.len + 2];
}
}
}
@@ -2663,7 +2634,7 @@ pub fn isIdentifier(text: string) bool {
return false;
}
- var iter = strings.CodepointIterator{ .bytes = text, .i = 0 };
+ const iter = strings.CodepointIterator{ .bytes = text, .i = 0 };
var cursor = strings.CodepointIterator.Cursor{};
if (!iter.next(&cursor)) return false;
@@ -2680,55 +2651,6 @@ pub fn isIdentifier(text: string) bool {
return true;
}
-pub const CodepointIterator = struct {
- bytes: []const u8,
- i: usize,
- width: u3 = 0,
- c: CodePoint = 0,
-
- pub fn nextCodepointSlice(it: *CodepointIterator) []const u8 {
- @setRuntimeSafety(false);
-
- const cp_len = strings.utf8ByteSequenceLength(it.bytes[it.i]);
- it.i += cp_len;
- // without branching,
-
- const slice = if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else "";
- it.width = @truncate(u3, slice.len);
- return slice;
- }
-
- pub fn nextCodepoint(it: *CodepointIterator) ?CodePoint {
- const slice = it.nextCodepointSlice();
- it.c = switch (it.width) {
- 0 => it.c,
- 1 => @as(CodePoint, slice[0]),
- 2 => @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable),
- 3 => @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable),
- 4 => @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable),
- else => unreachable,
- };
-
- return if (slice.len > 0) it.c else null;
- }
-
- /// Look ahead at the next n codepoints without advancing the iterator.
- /// If fewer than n codepoints are available, then return the remainder of the string.
- pub fn peek(it: *CodepointIterator, n: usize) []const u8 {
- const original_i = it.i;
- defer it.i = original_i;
-
- var end_ix = original_i;
- var found: usize = 0;
- while (found < n) : (found += 1) {
- const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
- end_ix += next_codepoint.len;
- }
-
- return it.bytes[original_i..end_ix];
- }
-};
-
pub fn isIdentifierUTF16(text: []const u16) bool {
const n = text.len;
if (n == 0) {