Rewrite the CodepointIterator to fix some bugs

author: Jarred Sumner <jarred@jarredsumner.com> 2021-10-23 04:58:20 -0700
committer: Jarred Sumner <jarred@jarredsumner.com> 2021-10-23 04:58:20 -0700
commit: de01d581c164679140172d3950bd97da64036b7e (patch)
tree: b397ffcb8371889c3329a2075117e4b45b36e20a
parent: f1bda194e0cb21b2050c36d4ae71d7424d42397f (diff)
download: bun-de01d581c164679140172d3950bd97da64036b7e.tar.gz
bun-de01d581c164679140172d3950bd97da64036b7e.tar.zst
bun-de01d581c164679140172d3950bd97da64036b7e.zip
1 file changed, 89 insertions, 81 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 94fc29b09..a6f018192 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -492,9 +492,9 @@ pub fn toUTF8Alloc(allocator: *std.mem.Allocator, js: []const u16) !string {
     var list = std.ArrayList(u8).initCapacity(allocator, js.len) catch unreachable;
     var i: usize = 0;
     while (i < js.len) : (i += 1) {
-        var r1 = @intCast(i32, js[i]);
+        var r1 = @as(i32, js[i]);
         if (r1 >= 0xD800 and r1 <= 0xDBFF and i + 1 < js.len) {
-            const r2 = @intCast(i32, js[i] + 1);
+            const r2 = @as(i32, js[i] + 1);
             if (r2 >= 0xDC00 and r2 <= 0xDFFF) {
                 r1 = (r1 - 0xD800) << 10 | (r2 - 0xDC00) + 0x10000;
                 i += 1;
@@ -577,57 +577,6 @@ pub fn encodeWTF8Rune(p: []u8, r: i32) u3 {
     }
 }
 
-pub fn toUTF16Buf(in: string, out: []u16) usize {
-    var utf8Iterator = CodepointIterator.init(in);
-
-    var c: u21 = 0;
-    var i: usize = 0;
-    while (true) {
-        const code_point = utf8Iterator.nextCodepoint();
-
-        switch (code_point) {
-            -1 => {
-                return i;
-            },
-            0...0xFFFF => {
-                out[i] = @intCast(u16, code_point);
-                i += 1;
-            },
-            else => {
-                c = code_point - 0x10000;
-                out[i] = @intCast(u16, 0xD800 + ((c >> 10) & 0x3FF));
-                i += 1;
-                out[i] = @intCast(u16, 0xDC00 + (c & 0x3FF));
-                i += 1;
-            },
-        }
-    }
-
-    return i;
-}
-
-pub fn toUTF16Alloc(in: string, allocator: *std.mem.Allocator) !JavascriptString {
-    var utf8Iterator = CodepointIterator.init(in);
-    var out = try std.ArrayList(u16).initCapacity(allocator, in.len);
-
-    var c: u21 = 0;
-    var i: usize = 0;
-    while (utf8Iterator.nextCodepoint()) |code_point| {
-        switch (code_point) {
-            0...0xFFFF => {
-                try out.append(@intCast(u16, code_point));
-            },
-            else => {
-                c = code_point - 0x10000;
-                try out.append(@intCast(u16, 0xD800 + ((c >> 10) & 0x3FF)));
-                try out.append(@intCast(u16, 0xDC00 + (c & 0x3FF)));
-            },
-        }
-    }
-
-    return out.toOwnedSlice();
-}
-
 pub fn containsNonBmpCodePoint(text: string) bool {
     var iter = std.unicode.Utf8Iterator{ .bytes = text, .i = 0 };
 
@@ -653,16 +602,20 @@ pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool {
     const n = _text.len;
     if (n > 0) {
         var i: usize = 0;
-        var c: u16 = 0;
-        var c2: u16 = 0;
         var text = _text[0 .. n - 1];
         while (i < n - 1) : (i += 1) {
-            c = text[i];
-            if (c >= 0xD800 and c <= 0xDBFF) {
-                c2 = text[i + 1];
-                if (c2 >= 0xDC00 and c2 <= 0xDFFF) {
-                    return true;
-                }
+            switch (text[i]) {
+                // Check for a high surrogate
+                0xD800...0xDBFF => {
+                    // Check for a low surrogate
+                    switch (text[i + 1]) {
+                        0xDC00...0xDFFF => {
+                            return true;
+                        },
+                        else => {},
+                    }
+                },
+                else => {},
             }
         }
     }
@@ -705,7 +658,17 @@ pub fn toASCIIHexValue(character: u8) u8 {
     };
 }
 
-pub fn utf8ByteSequenceLength(first_byte: u8) u3 {
+pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 {
+    return switch (first_byte) {
+        0b0000_0000...0b0111_1111 => 1,
+        0b1100_0000...0b1101_1111 => 2,
+        0b1110_0000...0b1110_1111 => 3,
+        0b1111_0000...0b1111_0111 => 4,
+        else => 0,
+    };
+}
+
+pub inline fn utf8ByteSequenceLength32(first_byte: u8) u32 {
     return switch (first_byte) {
         0b0000_0000...0b0111_1111 => 1,
         0b1100_0000...0b1101_1111 => 2,
@@ -720,24 +683,61 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
         const Iterator = @This();
         bytes: []const u8,
         i: usize,
+        next_width: usize = 0,
         width: u3 = 0,
-        c: CodePointType = 0,
+        c: CodePointType = zeroValue,
+
+        pub const Cursor = struct {
+            i: u32 = 0,
+            c: CodePointType = zeroValue,
+            width: u3 = 0,
+        };
 
-        pub fn init(str: string) CodepointIterator {
-            return CodepointIterator{ .bytes = str, .i = 0, .width = 0, .c = 0 };
+        pub fn init(str: string) Iterator {
+            return Iterator{ .bytes = str, .i = 0, .c = zeroValue };
         }
 
-        pub fn initOffset(str: string, i: usize) CodepointIterator {
-            return CodepointIterator{ .bytes = str, .i = i, .width = 0, .c = 0 };
+        pub fn initOffset(str: string, i: usize) Iterator {
+            return Iterator{ .bytes = str, .i = i, .c = zeroValue };
+        }
+
+        pub inline fn next(it: *const Iterator, cursor: *Cursor) bool {
+            const pos: u32 = @as(u32, cursor.width) + cursor.i;
+            if (pos >= it.bytes.len) {
+                return false;
+            }
+
+            const cp_len = utf8ByteSequenceLength(it.bytes[pos]);
+            cursor.* = Cursor{
+                .i = pos,
+                .c = @as(
+                    CodePointType,
+                    switch (cp_len) {
+                        1 => it.bytes[pos],
+                        2 => std.unicode.utf8Decode2(it.bytes[pos..][0..2]) catch return false,
+                        3 => std.unicode.utf8Decode3(it.bytes[pos..][0..3]) catch return false,
+                        4 => std.unicode.utf8Decode4(it.bytes[pos..][0..4]) catch return false,
+                        else => return false,
+                    },
+                ),
+                .width = cp_len,
+            };
+            return true;
         }
 
         inline fn nextCodepointSlice(it: *Iterator) []const u8 {
-            @setRuntimeSafety(false);
+            const bytes = it.bytes;
+            const prev = it.i;
+            const next_ = prev + it.next_width;
+            if (bytes.len <= next_) return "";
 
-            const cp_len = utf8ByteSequenceLength(it.bytes[it.i]);
-            it.i += cp_len;
+            const cp_len = utf8ByteSequenceLength(bytes[next_]);
+            it.next_width = cp_len;
+            it.i = @minimum(next_, bytes.len);
 
-            return if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else "";
+            const slice = bytes[prev..][0..cp_len];
+            it.width = @intCast(u3, slice.len);
+            return slice;
         }
 
         pub fn needsUTF8Decoding(slice: string) bool {
@@ -745,9 +745,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
 
             while (true) {
                 const part = it.nextCodepointSlice();
-                it.width = @intCast(u3, part.len);
                 @setRuntimeSafety(false);
-                switch (it.width) {
+                switch (part.len) {
                     0 => return false,
                     1 => continue,
                     else => return true,
@@ -756,8 +755,6 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
         }
 
         pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize {
-            @setRuntimeSafety(false);
-
             while (iter.c > -1) {
                 if (!switch (iter.nextCodepoint()) {
                     quote => false,
@@ -778,10 +775,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
 
         pub fn nextCodepoint(it: *Iterator) CodePointType {
             const slice = it.nextCodepointSlice();
-            it.width = @intCast(u3, slice.len);
-            @setRuntimeSafety(false);
 
-            it.c = switch (it.width) {
+            it.c = switch (slice.len) {
                 0 => zeroValue,
                 1 => @intCast(CodePointType, slice[0]),
                 2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable),
@@ -793,12 +788,25 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co
             return it.c;
         }
 
+        pub fn nextCodepointNullable(it: *Iterator) ?CodePointType {
+            const slice = it.nextCodepointSlice();
+            if (slice.len == 0) return null;
+
+            it.c = switch (slice.len) {
+                1 => @intCast(CodePointType, slice[0]),
+                2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable),
+                3 => @intCast(CodePointType, std.unicode.utf8Decode3(slice) catch unreachable),
+                4 => @intCast(CodePointType, std.unicode.utf8Decode4(slice) catch unreachable),
+                else => unreachable,
+            };
+
+            return it.c;
+        }
+
         pub fn nextCodepointNoReturn(it: *Iterator) void {
             const slice = it.nextCodepointSlice();
-            it.width = @intCast(u3, slice.len);
-            @setRuntimeSafety(false);
 
-            it.c = switch (it.width) {
+            it.c = switch (slice.len) {
                 0 => zeroValue,
                 1 => @intCast(CodePointType, slice[0]),
                 2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable),
author	Jarred Sumner <jarred@jarredsumner.com>	2021-10-23 04:58:20 -0700
committer	Jarred Sumner <jarred@jarredsumner.com>	2021-10-23 04:58:20 -0700
commit	de01d581c164679140172d3950bd97da64036b7e (patch)
tree	b397ffcb8371889c3329a2075117e4b45b36e20a
parent	f1bda194e0cb21b2050c36d4ae71d7424d42397f (diff)
download	bun-de01d581c164679140172d3950bd97da64036b7e.tar.gz bun-de01d581c164679140172d3950bd97da64036b7e.tar.zst bun-de01d581c164679140172d3950bd97da64036b7e.zip