aboutsummaryrefslogtreecommitdiff
path: root/src/string_immutable.zig
diff options
context:
space:
mode:
authorGravatar Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-02-08 14:42:10 -0800
committerGravatar GitHub <noreply@github.com> 2023-02-08 14:42:10 -0800
commit6fdbb25f9a04e7404e12ab4e157df99930c6dddd (patch)
treeb4f405eff9eb518b909b3976c6f77f57f8419002 /src/string_immutable.zig
parentb8c0554839832b1cf9695cb9e2375488565412d0 (diff)
downloadbun-6fdbb25f9a04e7404e12ab4e157df99930c6dddd.tar.gz
bun-6fdbb25f9a04e7404e12ab4e157df99930c6dddd.tar.zst
bun-6fdbb25f9a04e7404e12ab4e157df99930c6dddd.zip
utf16 to utf8 conversion validation (#2001)
* use replacement character for invalid surrogate pairs
* return index of non-ascii
* non-allocating case
* edge cases
* function rename
* oops
* get length once, index counter
Diffstat (limited to 'src/string_immutable.zig')
-rw-r--r--src/string_immutable.zig44
1 files changed, 37 insertions, 7 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 63dd70090..5408e97bb 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1227,12 +1227,37 @@ pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement {
}
}
+/// Converts little-endian UTF-16 `utf16` to UTF-8 using simdutf, replacing every
+/// unpaired surrogate with U+FFFD (the Unicode replacement character).
+///
+/// Output is written into the backing buffer of `list_` starting at offset 0, so
+/// any items already in the list are overwritten. The caller is expected to have
+/// reserved capacity for the first conversion pass (the callers in this file size
+/// the list with `bun.simdutf.length.utf8.from.utf16.le(utf16)`); additional
+/// capacity needed after a replacement is reserved here.
+///
+/// Returns the updated list, whose `items` hold the converted bytes. Fails only
+/// if growing the list fails.
+pub fn convertUTF16ToUTF8(list_: std.ArrayList(u8), comptime Type: type, utf16: Type) !std.ArrayList(u8) {
+    var list = list_;
+
+    // UTF-8 encoding of U+FFFD.
+    const replacement_char = [_]u8{ 239, 191, 189 };
+
+    var remaining_input = utf16;
+    // Number of UTF-8 bytes produced so far (also the next write offset).
+    var written: usize = 0;
+
+    var result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remaining_input, list.items.ptr[written..list.capacity]);
+    while (result.status == .surrogate) {
+        // On error, `result.count` is the index of the offending code unit in the
+        // *input*, not the number of bytes written. The units before it are valid
+        // UTF-16, so their UTF-8 length is exactly what was written for them.
+        written += bun.simdutf.length.utf8.from.utf16.le(remaining_input[0..result.count]);
+        // Skip the unpaired surrogate itself.
+        remaining_input = remaining_input[result.count + 1 ..];
+
+        // Publish the bytes written so far so a reallocation preserves them, then
+        // reserve room for the replacement character plus the rest of the input.
+        // The replacement takes 3 bytes, which may be more than the caller's
+        // estimate allotted for the surrogate.
+        list.items.len = written;
+        try list.ensureUnusedCapacity(replacement_char.len + bun.simdutf.length.utf8.from.utf16.le(remaining_input));
+
+        list.items.len = written + replacement_char.len;
+        list.items[written..][0..replacement_char.len].* = replacement_char;
+        written = list.items.len;
+
+        result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(remaining_input, list.items.ptr[written..list.capacity]);
+    }
+
+    // The last pass did not stop on a surrogate; there `result.count` is the
+    // number of UTF-8 bytes it wrote.
+    list.items.len = written + result.count;
+    return list;
+}
+
+
pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 {
if (bun.FeatureFlags.use_simdutf and comptime Type == []const u16) {
const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
- var list = try allocator.alloc(u8, length);
- _ = bun.simdutf.convert.utf16.to.utf8.le(utf16, list);
- return list;
+ var list = try std.ArrayList(u8).initCapacity(allocator, length);
+ list = try convertUTF16ToUTF8(list, Type, utf16);
+ return list.items;
}
var list = try std.ArrayList(u8).initCapacity(allocator, utf16.len);
@@ -1245,8 +1270,7 @@ pub fn toUTF8ListWithType(list_: std.ArrayList(u8), comptime Type: type, utf16:
var list = list_;
const length = bun.simdutf.length.utf8.from.utf16.le(utf16);
try list.ensureTotalCapacityPrecise(length);
- list.items.len += bun.simdutf.convert.utf16.to.utf8.le(utf16, list.items.ptr[0..length]);
- return list;
+ return convertUTF16ToUTF8(list, Type, utf16);
}
return toUTF8ListWithTypeBun(list_, Type, utf16);
@@ -3457,16 +3481,22 @@ pub fn firstNonASCII16CheckMin(comptime Slice: type, slice: Slice, comptime chec
}
if (comptime check_min) {
+ var i: usize = 0;
for (remaining) |char| {
if (char > 127 or char < 0x20) {
- return @truncate(u32, (@ptrToInt(std.mem.sliceAsBytes(remaining).ptr) - @ptrToInt(std.mem.sliceAsBytes(slice).ptr)) / 2);
+ return @truncate(u32, i);
}
+
+ i += 1;
}
} else {
+ var i: usize = 0;
for (remaining) |char| {
if (char > 127) {
- return @truncate(u32, (@ptrToInt(std.mem.sliceAsBytes(remaining).ptr) - @ptrToInt(std.mem.sliceAsBytes(slice).ptr)) / 2);
+ return @truncate(u32, i);
}
+
+ i += 1;
}
}