diff options
| author | 2021-10-23 04:58:20 -0700 | |
|---|---|---|
| committer | 2021-10-23 04:58:20 -0700 | |
| commit | de01d581c164679140172d3950bd97da64036b7e (patch) | |
| tree | b397ffcb8371889c3329a2075117e4b45b36e20a | |
| parent | f1bda194e0cb21b2050c36d4ae71d7424d42397f (diff) | |
| download | bun-de01d581c164679140172d3950bd97da64036b7e.tar.gz bun-de01d581c164679140172d3950bd97da64036b7e.tar.zst bun-de01d581c164679140172d3950bd97da64036b7e.zip | |
Rewrite the CodepointIterator to fix some bugs
| -rw-r--r-- | src/string_immutable.zig | 170 |
1 files changed, 89 insertions, 81 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 94fc29b09..a6f018192 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -492,9 +492,9 @@ pub fn toUTF8Alloc(allocator: *std.mem.Allocator, js: []const u16) !string { var list = std.ArrayList(u8).initCapacity(allocator, js.len) catch unreachable; var i: usize = 0; while (i < js.len) : (i += 1) { - var r1 = @intCast(i32, js[i]); + var r1 = @as(i32, js[i]); if (r1 >= 0xD800 and r1 <= 0xDBFF and i + 1 < js.len) { - const r2 = @intCast(i32, js[i] + 1); + const r2 = @as(i32, js[i] + 1); if (r2 >= 0xDC00 and r2 <= 0xDFFF) { r1 = (r1 - 0xD800) << 10 | (r2 - 0xDC00) + 0x10000; i += 1; @@ -577,57 +577,6 @@ pub fn encodeWTF8Rune(p: []u8, r: i32) u3 { } } -pub fn toUTF16Buf(in: string, out: []u16) usize { - var utf8Iterator = CodepointIterator.init(in); - - var c: u21 = 0; - var i: usize = 0; - while (true) { - const code_point = utf8Iterator.nextCodepoint(); - - switch (code_point) { - -1 => { - return i; - }, - 0...0xFFFF => { - out[i] = @intCast(u16, code_point); - i += 1; - }, - else => { - c = code_point - 0x10000; - out[i] = @intCast(u16, 0xD800 + ((c >> 10) & 0x3FF)); - i += 1; - out[i] = @intCast(u16, 0xDC00 + (c & 0x3FF)); - i += 1; - }, - } - } - - return i; -} - -pub fn toUTF16Alloc(in: string, allocator: *std.mem.Allocator) !JavascriptString { - var utf8Iterator = CodepointIterator.init(in); - var out = try std.ArrayList(u16).initCapacity(allocator, in.len); - - var c: u21 = 0; - var i: usize = 0; - while (utf8Iterator.nextCodepoint()) |code_point| { - switch (code_point) { - 0...0xFFFF => { - try out.append(@intCast(u16, code_point)); - }, - else => { - c = code_point - 0x10000; - try out.append(@intCast(u16, 0xD800 + ((c >> 10) & 0x3FF))); - try out.append(@intCast(u16, 0xDC00 + (c & 0x3FF))); - }, - } - } - - return out.toOwnedSlice(); -} - pub fn containsNonBmpCodePoint(text: string) bool { var iter = std.unicode.Utf8Iterator{ .bytes = text, .i = 0 }; @@ -653,16 +602,20 @@ pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool { const n = _text.len; if (n > 0) { var i: usize = 0; - var c: u16 = 0; - var c2: u16 = 0; var text = _text[0 .. n - 1]; while (i < n - 1) : (i += 1) { - c = text[i]; - if (c >= 0xD800 and c <= 0xDBFF) { - c2 = text[i + 1]; - if (c2 >= 0xDC00 and c2 <= 0xDFFF) { - return true; - } + switch (text[i]) { + // Check for a high surrogate + 0xD800...0xDBFF => { + // Check for a low surrogate + switch (text[i + 1]) { + 0xDC00...0xDFFF => { + return true; + }, + else => {}, + } + }, + else => {}, } } } @@ -705,7 +658,17 @@ pub fn toASCIIHexValue(character: u8) u8 { }; } -pub fn utf8ByteSequenceLength(first_byte: u8) u3 { +pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 { + return switch (first_byte) { + 0b0000_0000...0b0111_1111 => 1, + 0b1100_0000...0b1101_1111 => 2, + 0b1110_0000...0b1110_1111 => 3, + 0b1111_0000...0b1111_0111 => 4, + else => 0, + }; +} + +pub inline fn utf8ByteSequenceLength32(first_byte: u8) u32 { return switch (first_byte) { 0b0000_0000...0b0111_1111 => 1, 0b1100_0000...0b1101_1111 => 2, @@ -720,24 +683,61 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co const Iterator = @This(); bytes: []const u8, i: usize, + next_width: usize = 0, width: u3 = 0, - c: CodePointType = 0, + c: CodePointType = zeroValue, + + pub const Cursor = struct { + i: u32 = 0, + c: CodePointType = zeroValue, + width: u3 = 0, + }; - pub fn init(str: string) CodepointIterator { - return CodepointIterator{ .bytes = str, .i = 0, .width = 0, .c = 0 }; + pub fn init(str: string) Iterator { + return Iterator{ .bytes = str, .i = 0, .c = zeroValue }; } - pub fn initOffset(str: string, i: usize) CodepointIterator { - return CodepointIterator{ .bytes = str, .i = i, .width = 0, .c = 0 }; + pub fn initOffset(str: string, i: usize) Iterator { + return Iterator{ .bytes = str, .i = i, .c = zeroValue }; + } + + pub inline fn next(it: *const Iterator, cursor: *Cursor) bool { + const pos: u32 = @as(u32, cursor.width) + cursor.i; + if (pos >= it.bytes.len) { + return false; + } + + const cp_len = utf8ByteSequenceLength(it.bytes[pos]); + cursor.* = Cursor{ + .i = pos, + .c = @as( + CodePointType, + switch (cp_len) { + 1 => it.bytes[pos], + 2 => std.unicode.utf8Decode2(it.bytes[pos..][0..2]) catch return false, + 3 => std.unicode.utf8Decode3(it.bytes[pos..][0..3]) catch return false, + 4 => std.unicode.utf8Decode4(it.bytes[pos..][0..4]) catch return false, + else => return false, + }, + ), + .width = cp_len, + }; + return true; } inline fn nextCodepointSlice(it: *Iterator) []const u8 { - @setRuntimeSafety(false); + const bytes = it.bytes; + const prev = it.i; + const next_ = prev + it.next_width; + if (bytes.len <= next_) return ""; - const cp_len = utf8ByteSequenceLength(it.bytes[it.i]); - it.i += cp_len; + const cp_len = utf8ByteSequenceLength(bytes[next_]); + it.next_width = cp_len; + it.i = @minimum(next_, bytes.len); - return if (!(it.i > it.bytes.len)) it.bytes[it.i - cp_len .. it.i] else ""; + const slice = bytes[prev..][0..cp_len]; + it.width = @intCast(u3, slice.len); + return slice; } pub fn needsUTF8Decoding(slice: string) bool { @@ -745,9 +745,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co while (true) { const part = it.nextCodepointSlice(); - it.width = @intCast(u3, part.len); @setRuntimeSafety(false); - switch (it.width) { + switch (part.len) { 0 => return false, 1 => continue, else => return true, @@ -756,8 +755,6 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co } pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize { - @setRuntimeSafety(false); - while (iter.c > -1) { if (!switch (iter.nextCodepoint()) { quote => false, @@ -778,10 +775,8 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co pub fn nextCodepoint(it: *Iterator) CodePointType { const slice = it.nextCodepointSlice(); - it.width = @intCast(u3, slice.len); - @setRuntimeSafety(false); - it.c = switch (it.width) { + it.c = switch (slice.len) { 0 => zeroValue, 1 => @intCast(CodePointType, slice[0]), 2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable), @@ -793,12 +788,25 @@ pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: co return it.c; } + pub fn nextCodepointNullable(it: *Iterator) ?CodePointType { + const slice = it.nextCodepointSlice(); + if (slice.len == 0) return null; + + it.c = switch (slice.len) { + 1 => @intCast(CodePointType, slice[0]), + 2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable), + 3 => @intCast(CodePointType, std.unicode.utf8Decode3(slice) catch unreachable), + 4 => @intCast(CodePointType, std.unicode.utf8Decode4(slice) catch unreachable), + else => unreachable, + }; + + return it.c; + } + pub fn nextCodepointNoReturn(it: *Iterator) void { const slice = it.nextCodepointSlice(); - it.width = @intCast(u3, slice.len); - @setRuntimeSafety(false); - it.c = switch (it.width) { + it.c = switch (slice.len) { 0 => zeroValue, 1 => @intCast(CodePointType, slice[0]), 2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable), |
