diff options
author | 2021-10-25 05:42:01 -0700 | |
---|---|---|
committer | 2021-10-25 05:42:01 -0700 | |
commit | 4e889c7b47bbfb5c638b24e02906964015d9b3f2 (patch) | |
tree | f38565c9a1636d10b1349f12d55c2f8717ad980d | |
parent | 2ed6605cc35adb8ee04d53d96e38617aa4597510 (diff) | |
download | bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.gz bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.zst bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.zip |
Fix JSX unicode handling, slightly improve perf
-rw-r--r-- | src/javascript/jsc/bindings/bindings.zig | 4 | ||||
-rw-r--r-- | src/js_lexer.zig | 152 | ||||
-rw-r--r-- | src/js_printer.zig | 74 | ||||
-rw-r--r-- | src/string_immutable.zig | 58 |
4 files changed, 137 insertions, 151 deletions
diff --git a/src/javascript/jsc/bindings/bindings.zig b/src/javascript/jsc/bindings/bindings.zig index 554f26e35..026c283a9 100644 --- a/src/javascript/jsc/bindings/bindings.zig +++ b/src/javascript/jsc/bindings/bindings.zig @@ -110,6 +110,10 @@ pub const ZigString = extern struct { return this.ptr[0..std.math.min(this.len, 4096)]; } + pub inline fn full(this: *const ZigString) []const u8 { + return this.ptr[0..this.len]; + } + pub fn trimmedSlice(this: *const ZigString) []const u8 { return std.mem.trim(u8, this.ptr[0..std.math.min(this.len, 4096)], " \r\n"); } diff --git a/src/js_lexer.zig b/src/js_lexer.zig index a966358b8..f5fe8cca4 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -212,14 +212,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier); } - pub inline fn stringLiteralUTF16(lexer: *LexerType) JavascriptString { - if (lexer.string_literal_is_ascii) { - return lexer.stringToUTF16(lexer.string_literal_slice); - } else { - return lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable; - } - } - pub fn deinit(this: *LexerType) void {} fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { @@ -227,7 +219,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { defer buf_.* = buf; if (comptime is_json) lexer.is_ascii_only = false; - var iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 }; + const iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 }; var iter = strings.CodepointIterator.Cursor{}; const start_length = buf.items.len; while (iterator.next(&iter)) { @@ -1747,7 +1739,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { if (lexer.string_literal_is_ascii) { return js_ast.E.String{ .utf8 = lexer.string_literal_slice }; } else { - return js_ast.E.String{ .value = lexer.stringLiteralUTF16() }; + return js_ast.E.String{ .value = lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable }; } } @@ -1791,16 +1783,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } // TODO: use wtf-8 encoding. - pub fn stringToUTF16(lexer: *LexerType, str: string) JavascriptString { - var buf: JavascriptString = lexer.allocator.alloc(u16, std.mem.len(str)) catch unreachable; - // theres prob a faster/better way - for (str) |char, i| { - buf[i] = char; - } - return buf; - } - - // TODO: use wtf-8 encoding. pub fn utf16ToStringWithValidation(lexer: *LexerType, js: JavascriptString) !string { // return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js); return utf16ToString(lexer, js); @@ -2112,25 +2094,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { var decoded = jsx_decode_buf; defer jsx_decode_buf = decoded; var decoded_ptr = &decoded; - var i: u32 = 0; + var after_last_non_whitespace: ?u32 = null; // Trim whitespace off the end of the first line var first_non_whitespace: ?u32 = 0; - while (i < text.len) { - const width: u3 = strings.utf8ByteSequenceLength(text[i]); - - const c: CodePoint = switch (width) { - 0 => -1, - 1 => @intCast(CodePoint, text[i]), - 2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable), - 3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable), - 4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable), - else => unreachable, - }; + const iterator = strings.CodepointIterator.init(text); + var cursor = strings.CodepointIterator.Cursor{}; - switch (c) { + while (iterator.next(&cursor)) { + switch (cursor.c) { '\r', '\n', 0x2028, 0x2029 => { if (first_non_whitespace != null and after_last_non_whitespace != null) { // Newline @@ -2148,15 +2122,14 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { '\t', ' ' => {}, else => { // Check for unusual whitespace characters - if (!isWhitespace(@intCast(CodePoint, c))) { - after_last_non_whitespace = i + width; + if (!isWhitespace(cursor.c)) { + after_last_non_whitespace = cursor.i + @as(u32, cursor.width); if (first_non_whitespace == null) { - first_non_whitespace = i; + first_non_whitespace = cursor.i; } } }, } - i += width; } if (first_non_whitespace) |start| { @@ -2171,25 +2144,13 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } pub fn decodeJSXEntities(lexer: *LexerType, text: string, out: *std.ArrayList(u16)) !void { - var i: usize = 0; - var buf = [4]u8{ 0, 0, 0, 0 }; - - while (i < text.len) { - const width: u3 = strings.utf8ByteSequenceLength(text[i]); - - var c: CodePoint = switch (width) { - 0 => -1, - 1 => @intCast(CodePoint, text[i]), - 2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable), - 3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable), - 4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable), - else => unreachable, - }; - i += width; - - if (c == '&') { - if (strings.indexOfChar(text[i..text.len], ';')) |length| { - const entity = text[i .. i + length]; + const iterator = strings.CodepointIterator.init(text); + var cursor = strings.CodepointIterator.Cursor{}; + + while (iterator.next(&cursor)) { + if (cursor.c == '&') { + if (strings.indexOfChar(text[cursor.i..], ';')) |length| { + const entity = text[cursor.i .. @as(usize, cursor.i) + length]; if (entity[0] == '#') { var number = entity[1..entity.len]; var base: u8 = 10; @@ -2197,22 +2158,32 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { number = number[1..number.len]; base = 16; } - c = try std.fmt.parseInt(i32, number, base); - i += length + 1; + cursor.c = try std.fmt.parseInt(i32, number, base); + cursor.i += @intCast(u32, length) + 1; + cursor.width = 0; } else if (tables.jsxEntity.get(entity)) |ent| { - c = ent; - i += length + 1; + cursor.c = ent; + cursor.i += @intCast(u32, length) + 1; } } } - if (c <= 0xFFFF) { - try out.append(@intCast(u16, c)); + if (cursor.c <= 0xFFFF) { + try out.append(@intCast(u16, cursor.c)); } else { - c -= 0x1000; + cursor.c -= 0x10000; try out.ensureUnusedCapacity(2); - out.appendAssumeCapacity(@intCast(u16, 0xD800 + ((c >> 10) & 0x3FF))); - out.appendAssumeCapacity(@intCast(u16, 0xDC00 + (c & 0x3FF))); + (out.items.ptr + out.items.len)[0..2].* = [_]u16{ + @truncate( + u16, + @bitCast(u32, @as(i32, 0xD800) + ((cursor.c >> 10) & 0x3FF)), + ), + @truncate( + u16, + @bitCast(u32, @as(i32, 0xDC00) + (cursor.c & 0x3FF)), + ), + }; + out.items = out.items.ptr[0 .. out.items.len + 2]; } } } @@ -2663,7 +2634,7 @@ pub fn isIdentifier(text: string) bool { return false; } - var iter = strings.CodepointIterator{ .bytes = text, .i = 0 }; + const iter = strings.CodepointIterator{ .bytes = text, .i = 0 }; var cursor = strings.CodepointIterator.Cursor{}; if (!iter.next(&cursor)) return false; @@ -2680,55 +2651,6 @@ pub fn isIdentifier(text: string) bool { return true; } -pub const CodepointIterator = struct { - bytes: []const u8, - i: usize, - width: u3 = 0, - c: CodePoint = 0, - - pub fn nextCodepointSlice(it: *CodepointIterator) []const u8 { - @setRuntimeSafety(false); - - const cp_len = strings.utf8ByteSequenceLength(it.bytes[it.i]); - it.i += cp_len; - // without branching, - - const slice = if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else ""; - it.width = @truncate(u3, slice.len); - return slice; - } - - pub fn nextCodepoint(it: *CodepointIterator) ?CodePoint { - const slice = it.nextCodepointSlice(); - it.c = switch (it.width) { - 0 => it.c, - 1 => @as(CodePoint, slice[0]), - 2 => @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable), - 3 => @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable), - 4 => @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable), - else => unreachable, - }; - - return if (slice.len > 0) it.c else null; - } - - /// Look ahead at the next n codepoints without advancing the iterator. - /// If fewer than n codepoints are available, then return the remainder of the string. - pub fn peek(it: *CodepointIterator, n: usize) []const u8 { - const original_i = it.i; - defer it.i = original_i; - - var end_ix = original_i; - var found: usize = 0; - while (found < n) : (found += 1) { - const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..]; - end_ix += next_codepoint.len; - } - - return it.bytes[original_i..end_ix]; - } -}; - pub fn isIdentifierUTF16(text: []const u16) bool { const n = text.len; if (n == 0) { diff --git a/src/js_printer.zig b/src/js_printer.zig index 868757b7d..0551305f6 100644 --- a/src/js_printer.zig +++ b/src/js_printer.zig @@ -37,10 +37,10 @@ const Ast = js_ast.Ast; const hex_chars = "0123456789ABCDEF"; const first_ascii = 0x20; const last_ascii = 0x7E; -const first_high_surrogate: u21 = 0xD800; -const last_high_surrogate: u21 = 0xDBFF; -const first_low_surrogate: u21 = 0xDC00; -const last_low_surrogate: u21 = 0xDFFF; +const first_high_surrogate = 0xD800; +const last_high_surrogate = 0xDBFF; +const first_low_surrogate = 0xDC00; +const last_low_surrogate = 0xDFFF; const CodepointIterator = @import("./string_immutable.zig").UnsignedCodepointIterator; const assert = std.debug.assert; @@ -601,11 +601,10 @@ pub fn NewPrinter( // e(text.len) catch unreachable; while (i < n) { - const CodeUnitType = u21; + const CodeUnitType = u32; - const c = @as(CodeUnitType, text[i]); + const c: CodeUnitType = text[i]; i += 1; - var r: CodeUnitType = 0; var width: u3 = 0; // TODO: here @@ -726,18 +725,17 @@ pub fn NewPrinter( else => { switch (c) { - first_high_surrogate...last_high_surrogate => { // Is there a next character? if (i < n) { - const c2: CodeUnitType = @as(CodeUnitType, text[i]); + const c2: CodeUnitType = text[i]; if (c2 >= first_high_surrogate and c2 <= last_low_surrogate) { - // this is some magic to me - r = (c << 10) + c2 + (0x10000 - (first_high_surrogate << 10) - first_low_surrogate); i += 1; + const r: CodeUnitType = 0x10000 + (((c & 0x03ff) << 10) | (c2 & 0x03ff)); + // Escape this character if UTF-8 isn't allowed if (ascii_only) { var ptr = e.writer.reserve(12) catch unreachable; @@ -749,20 +747,18 @@ pub fn NewPrinter( continue; // Otherwise, encode to UTF-8 - } else { - var ptr = e.writer.reserve(4) catch unreachable; - e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r)); - continue; } + + var ptr = e.writer.reserve(4) catch unreachable; + e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r)); + continue; } } - { - // Write an unpaired high surrogate - var ptr = e.writer.reserve(6) catch unreachable; - ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; - e.writer.advance(6); - } + // Write an unpaired high surrogate + var ptr = e.writer.reserve(6) catch unreachable; + ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; + e.writer.advance(6); }, // Is this an unpaired low surrogate or four-digit hex escape? first_low_surrogate...last_low_surrogate => { @@ -3825,35 +3821,45 @@ pub fn NewPrinter( } pub fn printIdentifierUTF16(p: *Printer, name: []const u16) !void { - var temp = [_]u8{ 0, 0, 0, 0, 0, 0 }; const n = name.len; var i: usize = 0; - while (i < n) : (i += 1) { - var c: u21 = name[i]; - if (c >= first_high_surrogate and c <= last_high_surrogate and i + 1 < n) { - const c2: u21 = name[i + 1]; - if (c2 >= first_low_surrogate and c2 <= last_low_surrogate) { - c = (c << 10) + c2 + (0x10000 - (first_high_surrogate << 10) - first_low_surrogate); - i += 1; - } + const CodeUnitType = u32; + while (i < n) { + var c: CodeUnitType = name[i]; + i += 1; + + if (c & ~@as(CodeUnitType, 0x03ff) == 0xd800 and i < n) { + c = 0x10000 + (((c & 0x03ff) << 10) | (name[i] & 0x03ff)); } if ((comptime ascii_only) and c > last_ascii) { switch (c) { 0...0xFFFF => { - p.print([_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }); + p.print( + [_]u8{ + '\\', + 'u', + hex_chars[c >> 12], + hex_chars[(c >> 8) & 15], + hex_chars[(c >> 4) & 15], + hex_chars[c & 15], + }, + ); }, else => { p.print("\\u"); - p.print(std.fmt.bufPrintIntToSlice(&temp, c, 16, .upper, .{})); + var buf_ptr = p.writer.reserve(4) catch unreachable; + p.writer.advance(strings.encodeWTF8RuneT(buf_ptr[0..4], CodeUnitType, c)); }, } continue; } - const width = try std.unicode.utf8Encode(c, &temp); - p.print(temp[0..width]); + { + var buf_ptr = p.writer.reserve(4) catch unreachable; + p.writer.advance(strings.encodeWTF8RuneT(buf_ptr[0..4], CodeUnitType, c)); + } } } diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 9bfd8df77..fe4c52a99 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -548,7 +548,7 @@ pub fn utf16EqlString(text: []const u16, str: string) bool { // This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info. -pub fn encodeWTF8Rune(p: []u8, r: i32) u3 { +pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 { return @call( .{ .modifier = .always_inline, @@ -562,7 +562,7 @@ pub fn encodeWTF8Rune(p: []u8, r: i32) u3 { ); } -pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 { +pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 { switch (r) { 0...0x7F => { p[0] = @intCast(u8, r); @@ -589,6 +589,60 @@ pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 { } } +pub fn codepointSize(comptime R: type, r: R) u3 { + return switch (r) { + 0b0000_0000...0b0111_1111 => 1, + 0b1100_0000...0b1101_1111 => 2, + 0b1110_0000...0b1110_1111 => 3, + 0b1111_0000...0b1111_0111 => 4, + else => 0, + }; +} + +// /// Encode Type into UTF-8 bytes. +// /// - Invalid unicode data becomes U+FFFD REPLACEMENT CHARACTER. +// /// - +// pub fn encodeUTF8RuneT(out: *[4]u8, comptime R: type, c: R) u3 { +// switch (c) { +// 0b0000_0000...0b0111_1111 => { +// out[0] = @intCast(u8, c); +// return 1; +// }, +// 0b1100_0000...0b1101_1111 => { +// out[0] = @truncate(u8, 0b11000000 | (c >> 6)); +// out[1] = @truncate(u8, 0b10000000 | c & 0b111111); +// return 2; +// }, + +// 0b1110_0000...0b1110_1111 => { +// if (0xd800 <= c and c <= 0xdfff) { +// // Replacement character +// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD }; + +// return 3; +// } + +// out[0] = @truncate(u8, 0b11100000 | (c >> 12)); +// out[1] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111); +// out[2] = @truncate(u8, 0b10000000 | c & 0b111111); +// return 3; +// }, +// 0b1111_0000...0b1111_0111 => { +// out[0] = @truncate(u8, 0b11110000 | (c >> 18)); +// out[1] = @truncate(u8, 0b10000000 | (c >> 12) & 0b111111); +// out[2] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111); +// out[3] = @truncate(u8, 0b10000000 | c & 0b111111); +// return 4; +// }, +// else => { +// // Replacement character +// out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD }; + +// return 3; +// }, +// } +// } + pub fn containsNonBmpCodePoint(text: string) bool { var iter = CodepointIterator.init(text); var curs = CodepointIterator.Cursor{}; |