about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jarred Sumner <jarred@jarredsumner.com> 2021-10-25 05:42:01 -0700
committer Jarred Sumner <jarred@jarredsumner.com> 2021-10-25 05:42:01 -0700
commit 4e889c7b47bbfb5c638b24e02906964015d9b3f2 (patch)
tree f38565c9a1636d10b1349f12d55c2f8717ad980d
parent 2ed6605cc35adb8ee04d53d96e38617aa4597510 (diff)
download bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.gz
bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.zst
bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.zip
Fix JSX unicode handling, slightly improve perf
-rw-r--r-- src/javascript/jsc/bindings/bindings.zig 4
-rw-r--r-- src/js_lexer.zig 152
-rw-r--r-- src/js_printer.zig 74
-rw-r--r-- src/string_immutable.zig 58
4 files changed, 137 insertions, 151 deletions
diff --git a/src/javascript/jsc/bindings/bindings.zig b/src/javascript/jsc/bindings/bindings.zig
index 554f26e35..026c283a9 100644
--- a/src/javascript/jsc/bindings/bindings.zig
+++ b/src/javascript/jsc/bindings/bindings.zig
@@ -110,6 +110,10 @@ pub const ZigString = extern struct {
return this.ptr[0..std.math.min(this.len, 4096)];
}
+ pub inline fn full(this: *const ZigString) []const u8 {
+ return this.ptr[0..this.len];
+ }
+
pub fn trimmedSlice(this: *const ZigString) []const u8 {
return std.mem.trim(u8, this.ptr[0..std.math.min(this.len, 4096)], " \r\n");
}
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index a966358b8..f5fe8cca4 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -212,14 +212,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier);
}
- pub inline fn stringLiteralUTF16(lexer: *LexerType) JavascriptString {
- if (lexer.string_literal_is_ascii) {
- return lexer.stringToUTF16(lexer.string_literal_slice);
- } else {
- return lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable;
- }
- }
-
pub fn deinit(this: *LexerType) void {}
fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
@@ -227,7 +219,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
defer buf_.* = buf;
if (comptime is_json) lexer.is_ascii_only = false;
- var iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 };
+ const iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 };
var iter = strings.CodepointIterator.Cursor{};
const start_length = buf.items.len;
while (iterator.next(&iter)) {
@@ -1747,7 +1739,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
if (lexer.string_literal_is_ascii) {
return js_ast.E.String{ .utf8 = lexer.string_literal_slice };
} else {
- return js_ast.E.String{ .value = lexer.stringLiteralUTF16() };
+ return js_ast.E.String{ .value = lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable };
}
}
@@ -1791,16 +1783,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
}
// TODO: use wtf-8 encoding.
- pub fn stringToUTF16(lexer: *LexerType, str: string) JavascriptString {
- var buf: JavascriptString = lexer.allocator.alloc(u16, std.mem.len(str)) catch unreachable;
- // theres prob a faster/better way
- for (str) |char, i| {
- buf[i] = char;
- }
- return buf;
- }
-
- // TODO: use wtf-8 encoding.
pub fn utf16ToStringWithValidation(lexer: *LexerType, js: JavascriptString) !string {
// return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js);
return utf16ToString(lexer, js);
@@ -2112,25 +2094,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
var decoded = jsx_decode_buf;
defer jsx_decode_buf = decoded;
var decoded_ptr = &decoded;
- var i: u32 = 0;
+
var after_last_non_whitespace: ?u32 = null;
// Trim whitespace off the end of the first line
var first_non_whitespace: ?u32 = 0;
- while (i < text.len) {
- const width: u3 = strings.utf8ByteSequenceLength(text[i]);
-
- const c: CodePoint = switch (width) {
- 0 => -1,
- 1 => @intCast(CodePoint, text[i]),
- 2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable),
- 3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable),
- 4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable),
- else => unreachable,
- };
+ const iterator = strings.CodepointIterator.init(text);
+ var cursor = strings.CodepointIterator.Cursor{};
- switch (c) {
+ while (iterator.next(&cursor)) {
+ switch (cursor.c) {
'\r', '\n', 0x2028, 0x2029 => {
if (first_non_whitespace != null and after_last_non_whitespace != null) {
// Newline
@@ -2148,15 +2122,14 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
'\t', ' ' => {},
else => {
// Check for unusual whitespace characters
- if (!isWhitespace(@intCast(CodePoint, c))) {
- after_last_non_whitespace = i + width;
+ if (!isWhitespace(cursor.c)) {
+ after_last_non_whitespace = cursor.i + @as(u32, cursor.width);
if (first_non_whitespace == null) {
- first_non_whitespace = i;
+ first_non_whitespace = cursor.i;
}
}
},
}
- i += width;
}
if (first_non_whitespace) |start| {
@@ -2171,25 +2144,13 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
}
pub fn decodeJSXEntities(lexer: *LexerType, text: string, out: *std.ArrayList(u16)) !void {
- var i: usize = 0;
- var buf = [4]u8{ 0, 0, 0, 0 };
-
- while (i < text.len) {
- const width: u3 = strings.utf8ByteSequenceLength(text[i]);
-
- var c: CodePoint = switch (width) {
- 0 => -1,
- 1 => @intCast(CodePoint, text[i]),
- 2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable),
- 3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable),
- 4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable),
- else => unreachable,
- };
- i += width;
-
- if (c == '&') {
- if (strings.indexOfChar(text[i..text.len], ';')) |length| {
- const entity = text[i .. i + length];
+ const iterator = strings.CodepointIterator.init(text);
+ var cursor = strings.CodepointIterator.Cursor{};
+
+ while (iterator.next(&cursor)) {
+ if (cursor.c == '&') {
+ if (strings.indexOfChar(text[cursor.i..], ';')) |length| {
+ const entity = text[cursor.i .. @as(usize, cursor.i) + length];
if (entity[0] == '#') {
var number = entity[1..entity.len];
var base: u8 = 10;
@@ -2197,22 +2158,32 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
number = number[1..number.len];
base = 16;
}
- c = try std.fmt.parseInt(i32, number, base);
- i += length + 1;
+ cursor.c = try std.fmt.parseInt(i32, number, base);
+ cursor.i += @intCast(u32, length) + 1;
+ cursor.width = 0;
} else if (tables.jsxEntity.get(entity)) |ent| {
- c = ent;
- i += length + 1;
+ cursor.c = ent;
+ cursor.i += @intCast(u32, length) + 1;
}
}
}
- if (c <= 0xFFFF) {
- try out.append(@intCast(u16, c));
+ if (cursor.c <= 0xFFFF) {
+ try out.append(@intCast(u16, cursor.c));
} else {
- c -= 0x1000;
+ cursor.c -= 0x10000;
try out.ensureUnusedCapacity(2);
- out.appendAssumeCapacity(@intCast(u16, 0xD800 + ((c >> 10) & 0x3FF)));
- out.appendAssumeCapacity(@intCast(u16, 0xDC00 + (c & 0x3FF)));
+ (out.items.ptr + out.items.len)[0..2].* = [_]u16{
+ @truncate(
+ u16,
+ @bitCast(u32, @as(i32, 0xD800) + ((cursor.c >> 10) & 0x3FF)),
+ ),
+ @truncate(
+ u16,
+ @bitCast(u32, @as(i32, 0xDC00) + (cursor.c & 0x3FF)),
+ ),
+ };
+ out.items = out.items.ptr[0 .. out.items.len + 2];
}
}
}
@@ -2663,7 +2634,7 @@ pub fn isIdentifier(text: string) bool {
return false;
}
- var iter = strings.CodepointIterator{ .bytes = text, .i = 0 };
+ const iter = strings.CodepointIterator{ .bytes = text, .i = 0 };
var cursor = strings.CodepointIterator.Cursor{};
if (!iter.next(&cursor)) return false;
@@ -2680,55 +2651,6 @@ pub fn isIdentifier(text: string) bool {
return true;
}
-pub const CodepointIterator = struct {
- bytes: []const u8,
- i: usize,
- width: u3 = 0,
- c: CodePoint = 0,
-
- pub fn nextCodepointSlice(it: *CodepointIterator) []const u8 {
- @setRuntimeSafety(false);
-
- const cp_len = strings.utf8ByteSequenceLength(it.bytes[it.i]);
- it.i += cp_len;
- // without branching,
-
- const slice = if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else "";
- it.width = @truncate(u3, slice.len);
- return slice;
- }
-
- pub fn nextCodepoint(it: *CodepointIterator) ?CodePoint {
- const slice = it.nextCodepointSlice();
- it.c = switch (it.width) {
- 0 => it.c,
- 1 => @as(CodePoint, slice[0]),
- 2 => @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable),
- 3 => @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable),
- 4 => @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable),
- else => unreachable,
- };
-
- return if (slice.len > 0) it.c else null;
- }
-
- /// Look ahead at the next n codepoints without advancing the iterator.
- /// If fewer than n codepoints are available, then return the remainder of the string.
- pub fn peek(it: *CodepointIterator, n: usize) []const u8 {
- const original_i = it.i;
- defer it.i = original_i;
-
- var end_ix = original_i;
- var found: usize = 0;
- while (found < n) : (found += 1) {
- const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
- end_ix += next_codepoint.len;
- }
-
- return it.bytes[original_i..end_ix];
- }
-};
-
pub fn isIdentifierUTF16(text: []const u16) bool {
const n = text.len;
if (n == 0) {
diff --git a/src/js_printer.zig b/src/js_printer.zig
index 868757b7d..0551305f6 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -37,10 +37,10 @@ const Ast = js_ast.Ast;
const hex_chars = "0123456789ABCDEF";
const first_ascii = 0x20;
const last_ascii = 0x7E;
-const first_high_surrogate: u21 = 0xD800;
-const last_high_surrogate: u21 = 0xDBFF;
-const first_low_surrogate: u21 = 0xDC00;
-const last_low_surrogate: u21 = 0xDFFF;
+const first_high_surrogate = 0xD800;
+const last_high_surrogate = 0xDBFF;
+const first_low_surrogate = 0xDC00;
+const last_low_surrogate = 0xDFFF;
const CodepointIterator = @import("./string_immutable.zig").UnsignedCodepointIterator;
const assert = std.debug.assert;
@@ -601,11 +601,10 @@ pub fn NewPrinter(
// e(text.len) catch unreachable;
while (i < n) {
- const CodeUnitType = u21;
+ const CodeUnitType = u32;
- const c = @as(CodeUnitType, text[i]);
+ const c: CodeUnitType = text[i];
i += 1;
- var r: CodeUnitType = 0;
var width: u3 = 0;
// TODO: here
@@ -726,18 +725,17 @@ pub fn NewPrinter(
else => {
switch (c) {
-
first_high_surrogate...last_high_surrogate => {
// Is there a next character?
if (i < n) {
- const c2: CodeUnitType = @as(CodeUnitType, text[i]);
+ const c2: CodeUnitType = text[i];
if (c2 >= first_high_surrogate and c2 <= last_low_surrogate) {
- // this is some magic to me
- r = (c << 10) + c2 + (0x10000 - (first_high_surrogate << 10) - first_low_surrogate);
i += 1;
+ const r: CodeUnitType = 0x10000 + (((c & 0x03ff) << 10) | (c2 & 0x03ff));
+
// Escape this character if UTF-8 isn't allowed
if (ascii_only) {
var ptr = e.writer.reserve(12) catch unreachable;
@@ -749,20 +747,18 @@ pub fn NewPrinter(
continue;
// Otherwise, encode to UTF-8
- } else {
- var ptr = e.writer.reserve(4) catch unreachable;
- e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r));
- continue;
}
+
+ var ptr = e.writer.reserve(4) catch unreachable;
+ e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r));
+ continue;
}
}
- {
- // Write an unpaired high surrogate
- var ptr = e.writer.reserve(6) catch unreachable;
- ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] };
- e.writer.advance(6);
- }
+ // Write an unpaired high surrogate
+ var ptr = e.writer.reserve(6) catch unreachable;
+ ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] };
+ e.writer.advance(6);
},
// Is this an unpaired low surrogate or four-digit hex escape?
first_low_surrogate...last_low_surrogate => {
@@ -3825,35 +3821,45 @@ pub fn NewPrinter(
}
pub fn printIdentifierUTF16(p: *Printer, name: []const u16) !void {
- var temp = [_]u8{ 0, 0, 0, 0, 0, 0 };
const n = name.len;
var i: usize = 0;
- while (i < n) : (i += 1) {
- var c: u21 = name[i];
- if (c >= first_high_surrogate and c <= last_high_surrogate and i + 1 < n) {
- const c2: u21 = name[i + 1];
- if (c2 >= first_low_surrogate and c2 <= last_low_surrogate) {
- c = (c << 10) + c2 + (0x10000 - (first_high_surrogate << 10) - first_low_surrogate);
- i += 1;
- }
+ const CodeUnitType = u32;
+ while (i < n) {
+ var c: CodeUnitType = name[i];
+ i += 1;
+
+ if (c & ~@as(CodeUnitType, 0x03ff) == 0xd800 and i < n) {
+ c = 0x10000 + (((c & 0x03ff) << 10) | (name[i] & 0x03ff));
}
if ((comptime ascii_only) and c > last_ascii) {
switch (c) {
0...0xFFFF => {
- p.print([_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] });
+ p.print(
+ [_]u8{
+ '\\',
+ 'u',
+ hex_chars[c >> 12],
+ hex_chars[(c >> 8) & 15],
+ hex_chars[(c >> 4) & 15],
+ hex_chars[c & 15],
+ },
+ );
},
else => {
p.print("\\u");
- p.print(std.fmt.bufPrintIntToSlice(&temp, c, 16, .upper, .{}));
+ var buf_ptr = p.writer.reserve(4) catch unreachable;
+ p.writer.advance(strings.encodeWTF8RuneT(buf_ptr[0..4], CodeUnitType, c));
},
}
continue;
}
- const width = try std.unicode.utf8Encode(c, &temp);
- p.print(temp[0..width]);
+ {
+ var buf_ptr = p.writer.reserve(4) catch unreachable;
+ p.writer.advance(strings.encodeWTF8RuneT(buf_ptr[0..4], CodeUnitType, c));
+ }
}
}
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 9bfd8df77..fe4c52a99 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -548,7 +548,7 @@ pub fn utf16EqlString(text: []const u16, str: string) bool {
// This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using
// WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info.
-pub fn encodeWTF8Rune(p: []u8, r: i32) u3 {
+pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 {
return @call(
.{
.modifier = .always_inline,
@@ -562,7 +562,7 @@ pub fn encodeWTF8Rune(p: []u8, r: i32) u3 {
);
}
-pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 {
+pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 {
switch (r) {
0...0x7F => {
p[0] = @intCast(u8, r);
@@ -589,6 +589,60 @@ pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 {
}
}
+pub fn codepointSize(comptime R: type, r: R) u3 {
+ return switch (r) {
+ 0b0000_0000...0b0111_1111 => 1,
+ 0b1100_0000...0b1101_1111 => 2,
+ 0b1110_0000...0b1110_1111 => 3,
+ 0b1111_0000...0b1111_0111 => 4,
+ else => 0,
+ };
+}
+
+// /// Encode Type into UTF-8 bytes.
+// /// - Invalid unicode data becomes U+FFFD REPLACEMENT CHARACTER.
+// /// -
+// pub fn encodeUTF8RuneT(out: *[4]u8, comptime R: type, c: R) u3 {
+// switch (c) {
+// 0b0000_0000...0b0111_1111 => {
+// out[0] = @intCast(u8, c);
+// return 1;
+// },
+// 0b1100_0000...0b1101_1111 => {
+// out[0] = @truncate(u8, 0b11000000 | (c >> 6));
+// out[1] = @truncate(u8, 0b10000000 | c & 0b111111);
+// return 2;
+// },
+
+// 0b1110_0000...0b1110_1111 => {
+// if (0xd800 <= c and c <= 0xdfff) {
+// // Replacement character
+// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD };
+
+// return 3;
+// }
+
+// out[0] = @truncate(u8, 0b11100000 | (c >> 12));
+// out[1] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111);
+// out[2] = @truncate(u8, 0b10000000 | c & 0b111111);
+// return 3;
+// },
+// 0b1111_0000...0b1111_0111 => {
+// out[0] = @truncate(u8, 0b11110000 | (c >> 18));
+// out[1] = @truncate(u8, 0b10000000 | (c >> 12) & 0b111111);
+// out[2] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111);
+// out[3] = @truncate(u8, 0b10000000 | c & 0b111111);
+// return 4;
+// },
+// else => {
+// // Replacement character
+// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD };
+
+// return 3;
+// },
+// }
+// }
+
pub fn containsNonBmpCodePoint(text: string) bool {
var iter = CodepointIterator.init(text);
var curs = CodepointIterator.Cursor{};