Fix all known string encoding bugs

author: Jarred Sumner <jarred@jarredsumner.com> 2021-10-25 00:52:07 -0700
committer: Jarred Sumner <jarred@jarredsumner.com> 2021-10-25 00:52:07 -0700
commit: 42c264bf7b45bdf7944d10260beeaf7c8b50a21a (patch)
tree: aaf099275cbafcf0b253a48dda30db2dc987ed66 /src/string_immutable.zig
parent: fe6564b5332a72116f68c1c95ae7da86fe2ca668 (diff)
download: bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.tar.gz
bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.tar.zst
bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.zip
1 files changed, 28 insertions, 25 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 0030b8708..9bfd8df77 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -549,39 +549,52 @@ pub fn utf16EqlString(text: []const u16, str: string) bool {
 // This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using
 // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info.
 pub fn encodeWTF8Rune(p: []u8, r: i32) u3 {
-    // Negative values are erroneous. Making it unsigned addresses the problem.
-    const i = @intCast(u32, r);
-    switch (i) {
+    return @call(
+        .{
+            .modifier = .always_inline,
+        },
+        encodeWTF8RuneT,
+        .{
+            p,
+            u32,
+            @intCast(u32, r),
+        },
+    );
+}
+
+pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 {
+    switch (r) {
         0...0x7F => {
             p[0] = @intCast(u8, r);
             return 1;
         },
         (0x7F + 1)...0x7FF => {
-            p[0] = 0xC0 | @intCast(u8, r >> 6);
-            p[1] = 0x80 | @intCast(u8, r) & 0x3F;
+            p[0] = @truncate(u8, 0xC0 | ((r >> 6)));
+            p[1] = @truncate(u8, 0x80 | (r & 0x3F));
             return 2;
         },
         (0x7FF + 1)...0xFFFF => {
-            p[0] = 0xE0 | @intCast(u8, r >> 12);
-            p[1] = 0x80 | @intCast(u8, r >> 6) & 0x3F;
-            p[2] = 0x80 | @intCast(u8, r) & 0x3F;
+            p[0] = @truncate(u8, 0xE0 | ((r >> 12)));
+            p[1] = @truncate(u8, 0x80 | ((r >> 6) & 0x3F));
+            p[2] = @truncate(u8, 0x80 | (r & 0x3F));
             return 3;
         },
         else => {
-            p[0] = 0xF0 | @intCast(u8, r >> 18);
-            p[1] = 0x80 | @intCast(u8, r >> 12) & 0x3F;
-            p[2] = 0x80 | @intCast(u8, r >> 6) & 0x3F;
-            p[3] = 0x80 | @intCast(u8, r) & 0x3F;
+            p[0] = @truncate(u8, 0xF0 | ((r >> 18)));
+            p[1] = @truncate(u8, 0x80 | ((r >> 12) & 0x3F));
+            p[2] = @truncate(u8, 0x80 | ((r >> 6) & 0x3F));
+            p[3] = @truncate(u8, 0x80 | (r & 0x3F));
             return 4;
         },
     }
 }
 
 pub fn containsNonBmpCodePoint(text: string) bool {
-    var iter = std.unicode.Utf8Iterator{ .bytes = text, .i = 0 };
+    var iter = CodepointIterator.init(text);
+    var curs = CodepointIterator.Cursor{};
 
-    while (iter.nextCodepoint()) |codepoint| {
-        if (codepoint > 0xFFFF) {
+    while (iter.next(&curs)) {
+        if (curs.c > 0xFFFF) {
             return true;
         }
     }
@@ -668,16 +681,6 @@ pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 {
     };
 }
 
-pub inline fn utf8ByteSequenceLength32(first_byte: u8) u32 {
-    return switch (first_byte) {
-        0b0000_0000...0b0111_1111 => 1,
-        0b1100_0000...0b1101_1111 => 2,
-        0b1110_0000...0b1110_1111 => 3,
-        0b1111_0000...0b1111_0111 => 4,
-        else => 0,
-    };
-}
-
 pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type {
     return struct {
         const Iterator = @This();
author	Jarred Sumner <jarred@jarredsumner.com>	2021-10-25 00:52:07 -0700
committer	Jarred Sumner <jarred@jarredsumner.com>	2021-10-25 00:52:07 -0700
commit	42c264bf7b45bdf7944d10260beeaf7c8b50a21a (patch)
tree	aaf099275cbafcf0b253a48dda30db2dc987ed66 /src/string_immutable.zig
parent	fe6564b5332a72116f68c1c95ae7da86fe2ca668 (diff)
download	bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.tar.gz bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.tar.zst bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.zip