Fix JSX unicode handling, slightly improve perf

author: Jarred Sumner <jarred@jarredsumner.com> 2021-10-25 05:42:01 -0700
committer: Jarred Sumner <jarred@jarredsumner.com> 2021-10-25 05:42:01 -0700
commit: 4e889c7b47bbfb5c638b24e02906964015d9b3f2 (patch)
tree: f38565c9a1636d10b1349f12d55c2f8717ad980d
parent: 2ed6605cc35adb8ee04d53d96e38617aa4597510 (diff)
download: bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.gz
bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.zst
bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.zip
4 files changed, 137 insertions, 151 deletions
diff --git a/src/javascript/jsc/bindings/bindings.zig b/src/javascript/jsc/bindings/bindings.zig
index 554f26e35..026c283a9 100644
--- a/src/javascript/jsc/bindings/bindings.zig
+++ b/src/javascript/jsc/bindings/bindings.zig
@@ -110,6 +110,10 @@ pub const ZigString = extern struct {
         return this.ptr[0..std.math.min(this.len, 4096)];
     }
 
+    pub inline fn full(this: *const ZigString) []const u8 {
+        return this.ptr[0..this.len]; // the entire ptr[0..len] range, with no 4096-byte cap applied
+    }
+
     pub fn trimmedSlice(this: *const ZigString) []const u8 {
         return std.mem.trim(u8, this.ptr[0..std.math.min(this.len, 4096)], " \r\n");
     }
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index a966358b8..f5fe8cca4 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -212,14 +212,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
             return @enumToInt(lexer.token) >= @enumToInt(T.t_identifier);
         }
 
-        pub inline fn stringLiteralUTF16(lexer: *LexerType) JavascriptString {
-            if (lexer.string_literal_is_ascii) {
-                return lexer.stringToUTF16(lexer.string_literal_slice);
-            } else {
-                return lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable;
-            }
-        }
-
         pub fn deinit(this: *LexerType) void {}
 
         fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
@@ -227,7 +219,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
             defer buf_.* = buf;
             if (comptime is_json) lexer.is_ascii_only = false;
 
-            var iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 };
+            const iterator = strings.CodepointIterator{ .bytes = text[start..], .i = 0 };
             var iter = strings.CodepointIterator.Cursor{};
             const start_length = buf.items.len;
             while (iterator.next(&iter)) {
@@ -1747,7 +1739,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
             if (lexer.string_literal_is_ascii) {
                 return js_ast.E.String{ .utf8 = lexer.string_literal_slice };
             } else {
-                return js_ast.E.String{ .value = lexer.stringLiteralUTF16() };
+                return js_ast.E.String{ .value = lexer.allocator.dupe(u16, lexer.string_literal) catch unreachable };
             }
         }
 
@@ -1791,16 +1783,6 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
         }
 
         // TODO: use wtf-8 encoding.
-        pub fn stringToUTF16(lexer: *LexerType, str: string) JavascriptString {
-            var buf: JavascriptString = lexer.allocator.alloc(u16, std.mem.len(str)) catch unreachable;
-            // theres prob a faster/better way
-            for (str) |char, i| {
-                buf[i] = char;
-            }
-            return buf;
-        }
-
-        // TODO: use wtf-8 encoding.
         pub fn utf16ToStringWithValidation(lexer: *LexerType, js: JavascriptString) !string {
             // return std.unicode.utf16leToUtf8Alloc(lexer.allocator, js);
             return utf16ToString(lexer, js);
@@ -2112,25 +2094,17 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
             var decoded = jsx_decode_buf;
             defer jsx_decode_buf = decoded;
             var decoded_ptr = &decoded;
-            var i: u32 = 0;
+
             var after_last_non_whitespace: ?u32 = null;
 
             // Trim whitespace off the end of the first line
             var first_non_whitespace: ?u32 = 0;
 
-            while (i < text.len) {
-                const width: u3 = strings.utf8ByteSequenceLength(text[i]);
-
-                const c: CodePoint = switch (width) {
-                    0 => -1,
-                    1 => @intCast(CodePoint, text[i]),
-                    2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable),
-                    3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable),
-                    4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable),
-                    else => unreachable,
-                };
+            const iterator = strings.CodepointIterator.init(text);
+            var cursor = strings.CodepointIterator.Cursor{};
 
-                switch (c) {
+            while (iterator.next(&cursor)) {
+                switch (cursor.c) {
                     '\r', '\n', 0x2028, 0x2029 => {
                         if (first_non_whitespace != null and after_last_non_whitespace != null) {
                             // Newline
@@ -2148,15 +2122,14 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
                     '\t', ' ' => {},
                     else => {
                         // Check for unusual whitespace characters
-                        if (!isWhitespace(@intCast(CodePoint, c))) {
-                            after_last_non_whitespace = i + width;
+                        if (!isWhitespace(cursor.c)) {
+                            after_last_non_whitespace = cursor.i + @as(u32, cursor.width);
                             if (first_non_whitespace == null) {
-                                first_non_whitespace = i;
+                                first_non_whitespace = cursor.i;
                             }
                         }
                     },
                 }
-                i += width;
             }
 
             if (first_non_whitespace) |start| {
@@ -2171,25 +2144,13 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
         }
 
         pub fn decodeJSXEntities(lexer: *LexerType, text: string, out: *std.ArrayList(u16)) !void {
-            var i: usize = 0;
-            var buf = [4]u8{ 0, 0, 0, 0 };
-
-            while (i < text.len) {
-                const width: u3 = strings.utf8ByteSequenceLength(text[i]);
-
-                var c: CodePoint = switch (width) {
-                    0 => -1,
-                    1 => @intCast(CodePoint, text[i]),
-                    2 => @intCast(CodePoint, std.unicode.utf8Decode2(text[i..][0..2]) catch unreachable),
-                    3 => @intCast(CodePoint, std.unicode.utf8Decode3(text[i..][0..3]) catch unreachable),
-                    4 => @intCast(CodePoint, std.unicode.utf8Decode4(text[i..][0..4]) catch unreachable),
-                    else => unreachable,
-                };
-                i += width;
-
-                if (c == '&') {
-                    if (strings.indexOfChar(text[i..text.len], ';')) |length| {
-                        const entity = text[i .. i + length];
+            const iterator = strings.CodepointIterator.init(text);
+            var cursor = strings.CodepointIterator.Cursor{};
+
+            while (iterator.next(&cursor)) {
+                if (cursor.c == '&') {
+                    if (strings.indexOfChar(text[cursor.i..], ';')) |length| {
+                        const entity = text[cursor.i .. @as(usize, cursor.i) + length];
                         if (entity[0] == '#') {
                             var number = entity[1..entity.len];
                             var base: u8 = 10;
@@ -2197,22 +2158,32 @@ pub fn NewLexer(comptime json_options: JSONOptions) type {
                                 number = number[1..number.len];
                                 base = 16;
                             }
-                            c = try std.fmt.parseInt(i32, number, base);
-                            i += length + 1;
+                            cursor.c = try std.fmt.parseInt(i32, number, base);
+                            cursor.i += @intCast(u32, length) + 1;
+                            cursor.width = 0;
                         } else if (tables.jsxEntity.get(entity)) |ent| {
-                            c = ent;
-                            i += length + 1;
+                            cursor.c = ent;
+                            cursor.i += @intCast(u32, length) + 1;
                         }
                     }
                 }
 
-                if (c <= 0xFFFF) {
-                    try out.append(@intCast(u16, c));
+                if (cursor.c <= 0xFFFF) {
+                    try out.append(@intCast(u16, cursor.c));
                 } else {
-                    c -= 0x1000;
+                    cursor.c -= 0x10000;
                     try out.ensureUnusedCapacity(2);
-                    out.appendAssumeCapacity(@intCast(u16, 0xD800 + ((c >> 10) & 0x3FF)));
-                    out.appendAssumeCapacity(@intCast(u16, 0xDC00 + (c & 0x3FF)));
+                    (out.items.ptr + out.items.len)[0..2].* = [_]u16{
+                        @truncate(
+                            u16,
+                            @bitCast(u32, @as(i32, 0xD800) + ((cursor.c >> 10) & 0x3FF)),
+                        ),
+                        @truncate(
+                            u16,
+                            @bitCast(u32, @as(i32, 0xDC00) + (cursor.c & 0x3FF)),
+                        ),
+                    };
+                    out.items = out.items.ptr[0 .. out.items.len + 2];
                 }
             }
         }
@@ -2663,7 +2634,7 @@ pub fn isIdentifier(text: string) bool {
         return false;
     }
 
-    var iter = strings.CodepointIterator{ .bytes = text, .i = 0 };
+    const iter = strings.CodepointIterator{ .bytes = text, .i = 0 };
     var cursor = strings.CodepointIterator.Cursor{};
     if (!iter.next(&cursor)) return false;
 
@@ -2680,55 +2651,6 @@ pub fn isIdentifier(text: string) bool {
     return true;
 }
 
-pub const CodepointIterator = struct {
-    bytes: []const u8,
-    i: usize,
-    width: u3 = 0,
-    c: CodePoint = 0,
-
-    pub fn nextCodepointSlice(it: *CodepointIterator) []const u8 {
-        @setRuntimeSafety(false);
-
-        const cp_len = strings.utf8ByteSequenceLength(it.bytes[it.i]);
-        it.i += cp_len;
-        // without branching,
-
-        const slice = if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else "";
-        it.width = @truncate(u3, slice.len);
-        return slice;
-    }
-
-    pub fn nextCodepoint(it: *CodepointIterator) ?CodePoint {
-        const slice = it.nextCodepointSlice();
-        it.c = switch (it.width) {
-            0 => it.c,
-            1 => @as(CodePoint, slice[0]),
-            2 => @as(CodePoint, unicode.utf8Decode2(slice) catch unreachable),
-            3 => @as(CodePoint, unicode.utf8Decode3(slice) catch unreachable),
-            4 => @as(CodePoint, unicode.utf8Decode4(slice) catch unreachable),
-            else => unreachable,
-        };
-
-        return if (slice.len > 0) it.c else null;
-    }
-
-    /// Look ahead at the next n codepoints without advancing the iterator.
-    /// If fewer than n codepoints are available, then return the remainder of the string.
-    pub fn peek(it: *CodepointIterator, n: usize) []const u8 {
-        const original_i = it.i;
-        defer it.i = original_i;
-
-        var end_ix = original_i;
-        var found: usize = 0;
-        while (found < n) : (found += 1) {
-            const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..];
-            end_ix += next_codepoint.len;
-        }
-
-        return it.bytes[original_i..end_ix];
-    }
-};
-
 pub fn isIdentifierUTF16(text: []const u16) bool {
     const n = text.len;
     if (n == 0) {
diff --git a/src/js_printer.zig b/src/js_printer.zig
index 868757b7d..0551305f6 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -37,10 +37,10 @@ const Ast = js_ast.Ast;
 const hex_chars = "0123456789ABCDEF";
 const first_ascii = 0x20;
 const last_ascii = 0x7E;
-const first_high_surrogate: u21 = 0xD800;
-const last_high_surrogate: u21 = 0xDBFF;
-const first_low_surrogate: u21 = 0xDC00;
-const last_low_surrogate: u21 = 0xDFFF;
+const first_high_surrogate = 0xD800;
+const last_high_surrogate = 0xDBFF;
+const first_low_surrogate = 0xDC00;
+const last_low_surrogate = 0xDFFF;
 const CodepointIterator = @import("./string_immutable.zig").UnsignedCodepointIterator;
 const assert = std.debug.assert;
 
@@ -601,11 +601,10 @@ pub fn NewPrinter(
             // e(text.len) catch unreachable;
 
             while (i < n) {
-                const CodeUnitType = u21;
+                const CodeUnitType = u32;
 
-                const c = @as(CodeUnitType, text[i]);
+                const c: CodeUnitType = text[i];
                 i += 1;
-                var r: CodeUnitType = 0;
                 var width: u3 = 0;
 
                 // TODO: here
@@ -726,18 +725,17 @@ pub fn NewPrinter(
 
                     else => {
                         switch (c) {
-                            
                             first_high_surrogate...last_high_surrogate => {
 
                                 // Is there a next character?
 
                                 if (i < n) {
-                                    const c2: CodeUnitType = @as(CodeUnitType, text[i]);
+                                    const c2: CodeUnitType = text[i];
 
                                     if (c2 >= first_high_surrogate and c2 <= last_low_surrogate) {
-                                        // this is some magic to me
-                                        r = (c << 10) + c2 + (0x10000 - (first_high_surrogate << 10) - first_low_surrogate);
                                         i += 1;
+                                        const r: CodeUnitType = 0x10000 + (((c & 0x03ff) << 10) | (c2 & 0x03ff));
+
                                         // Escape this character if UTF-8 isn't allowed
                                         if (ascii_only) {
                                             var ptr = e.writer.reserve(12) catch unreachable;
@@ -749,20 +747,18 @@ pub fn NewPrinter(
 
                                             continue;
                                             // Otherwise, encode to UTF-8
-                                        } else {
-                                            var ptr = e.writer.reserve(4) catch unreachable;
-                                            e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r));
-                                            continue;
                                         }
+
+                                        var ptr = e.writer.reserve(4) catch unreachable;
+                                        e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r));
+                                        continue;
                                     }
                                 }
 
-                                {
-                                    // Write an unpaired high surrogate
-                                    var ptr = e.writer.reserve(6) catch unreachable;
-                                    ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] };
-                                    e.writer.advance(6);
-                                }
+                                // Write an unpaired high surrogate
+                                var ptr = e.writer.reserve(6) catch unreachable;
+                                ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] };
+                                e.writer.advance(6);
                             },
                             // Is this an unpaired low surrogate or four-digit hex escape?
                             first_low_surrogate...last_low_surrogate => {
@@ -3825,35 +3821,45 @@ pub fn NewPrinter(
         }
 
         pub fn printIdentifierUTF16(p: *Printer, name: []const u16) !void {
-            var temp = [_]u8{ 0, 0, 0, 0, 0, 0 };
             const n = name.len;
             var i: usize = 0;
-            while (i < n) : (i += 1) {
-                var c: u21 = name[i];
 
-                if (c >= first_high_surrogate and c <= last_high_surrogate and i + 1 < n) {
-                    const c2: u21 = name[i + 1];
-                    if (c2 >= first_low_surrogate and c2 <= last_low_surrogate) {
-                        c = (c << 10) + c2 + (0x10000 - (first_high_surrogate << 10) - first_low_surrogate);
-                        i += 1;
-                    }
+            const CodeUnitType = u32; // wide enough to hold a combined surrogate pair (up to 0x10FFFF)
+            while (i < n) {
+                var c: CodeUnitType = name[i];
+                i += 1; // `i` now indexes the unit after `c`
+                if (c & ~@as(CodeUnitType, 0x03ff) == 0xd800 and i < n and name[i] & ~@as(u16, 0x03ff) == 0xdc00) {
+                    c = 0x10000 + (((c & 0x03ff) << 10) | (name[i] & 0x03ff)); // high + low surrogate -> code point
+                    i += 1; // consume the low surrogate so it is not emitted again on the next iteration
                 }
 
                 if ((comptime ascii_only) and c > last_ascii) {
                     switch (c) {
                         0...0xFFFF => {
-                            p.print([_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] });
+                            p.print(
+                                [_]u8{
+                                    '\\',
+                                    'u',
+                                    hex_chars[c >> 12],
+                                    hex_chars[(c >> 8) & 15],
+                                    hex_chars[(c >> 4) & 15],
+                                    hex_chars[c & 15],
+                                },
+                            );
                         },
                         else => {
                             p.print("\\u");
-                            p.print(std.fmt.bufPrintIntToSlice(&temp, c, 16, .upper, .{}));
+                            var hex_buf: [8]u8 = undefined; // "{" + at most 6 hex digits (c <= 0x10FFFF) + "}"
+                            p.print(std.fmt.bufPrint(&hex_buf, "{{{X}}}", .{c}) catch unreachable); // ASCII-only \u{XXXXX}
                         },
                     }
                     continue;
                 }
 
-                const width = try std.unicode.utf8Encode(c, &temp);
-                p.print(temp[0..width]);
+                {
+                    var buf_ptr = p.writer.reserve(4) catch unreachable;
+                    p.writer.advance(strings.encodeWTF8RuneT(buf_ptr[0..4], CodeUnitType, c));
+                }
             }
         }
 
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 9bfd8df77..fe4c52a99 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -548,7 +548,7 @@ pub fn utf16EqlString(text: []const u16, str: string) bool {
 
 // This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using
 // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info.
-pub fn encodeWTF8Rune(p: []u8, r: i32) u3 {
+pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 {
     return @call(
         .{
             .modifier = .always_inline,
@@ -562,7 +562,7 @@ pub fn encodeWTF8Rune(p: []u8, r: i32) u3 {
     );
 }
 
-pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 {
+pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 {
     switch (r) {
         0...0x7F => {
             p[0] = @intCast(u8, r);
@@ -589,6 +589,60 @@ pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 {
     }
 }
 
+pub fn codepointSize(comptime R: type, r: R) u3 { // NOTE(review): ranges look like UTF-8 lead-byte patterns, so `r` is presumably a first byte, not a decoded code point - confirm against callers
+    return switch (r) {
+        0b0000_0000...0b0111_1111 => 1, // 0xxxxxxx
+        0b1100_0000...0b1101_1111 => 2, // 110xxxxx
+        0b1110_0000...0b1110_1111 => 3, // 1110xxxx
+        0b1111_0000...0b1111_0111 => 4, // 11110xxx
+        else => 0, // everything else, including 10xxxxxx and values above 0b1111_0111
+    };
+}
+
+// /// Encode Type into UTF-8 bytes.
+// /// - Invalid unicode data becomes U+FFFD REPLACEMENT CHARACTER.
+// /// -
+// pub fn encodeUTF8RuneT(out: *[4]u8, comptime R: type, c: R) u3 {
+//     switch (c) {
+//         0b0000_0000...0b0111_1111 => {
+//             out[0] = @intCast(u8, c);
+//             return 1;
+//         },
+//         0b1100_0000...0b1101_1111 => {
+//             out[0] = @truncate(u8, 0b11000000 | (c >> 6));
+//             out[1] = @truncate(u8, 0b10000000 | c & 0b111111);
+//             return 2;
+//         },
+
+//         0b1110_0000...0b1110_1111 => {
+//             if (0xd800 <= c and c <= 0xdfff) {
+//                 // Replacement character
+//                 out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD };
+
+//                 return 3;
+//             }
+
+//             out[0] = @truncate(u8, 0b11100000 | (c >> 12));
+//             out[1] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111);
+//             out[2] = @truncate(u8, 0b10000000 | c & 0b111111);
+//             return 3;
+//         },
+//         0b1111_0000...0b1111_0111 => {
+//             out[0] = @truncate(u8, 0b11110000 | (c >> 18));
+//             out[1] = @truncate(u8, 0b10000000 | (c >> 12) & 0b111111);
+//             out[2] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111);
+//             out[3] = @truncate(u8, 0b10000000 | c & 0b111111);
+//             return 4;
+//         },
+//         else => {
+//             // Replacement character
+//             out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD };
+
+//             return 3;
+//         },
+//     }
+// }
+
 pub fn containsNonBmpCodePoint(text: string) bool {
     var iter = CodepointIterator.init(text);
     var curs = CodepointIterator.Cursor{};
author	Jarred Sumner <jarred@jarredsumner.com>	2021-10-25 05:42:01 -0700
committer	Jarred Sumner <jarred@jarredsumner.com>	2021-10-25 05:42:01 -0700
commit	4e889c7b47bbfb5c638b24e02906964015d9b3f2 (patch)
tree	f38565c9a1636d10b1349f12d55c2f8717ad980d
parent	2ed6605cc35adb8ee04d53d96e38617aa4597510 (diff)
download	bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.gz bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.tar.zst bun-4e889c7b47bbfb5c638b24e02906964015d9b3f2.zip