aboutsummaryrefslogtreecommitdiff
path: root/src/string_immutable.zig
diff options
context:
space:
mode:
Diffstat (limited to 'src/string_immutable.zig')
-rw-r--r--src/string_immutable.zig61
1 files changed, 49 insertions, 12 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 7bdd7cfe5..16caa133f 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1220,6 +1220,40 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
return null;
}
+/// Decodes the code point at the start of a UTF-16 sequence, reporting
+/// unpaired surrogates as `unicode_replacement` (U+FFFD, going by the
+/// function name) instead of passing them through.
+///
+/// `input` must contain at least one code unit: `input[0]` is read
+/// unconditionally.
+///
+/// The returned `UTF16Replacement` is filled in as follows:
+///   - non-surrogate unit: `.code_point` is that unit, `.len = 1`.
+///   - high surrogate followed by a low surrogate: the combined
+///     supplementary code point, `.len = 2`.
+///   - high surrogate that is the last unit of `input`: `.len = 1`, with
+///     `.code_point` and `.fail` left at the struct's defaults.
+///   - high surrogate followed by anything else, or a leading low
+///     surrogate: `.fail = true`, `.len = 1`,
+///     `.code_point = unicode_replacement`.
+pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement {
+    // Clears the low 10 payload bits so only the surrogate tag bits remain.
+    const surrogate_mask = ~@as(u21, 0x03ff);
+    const c0 = @as(u21, input[0]);
+
+    if (c0 & surrogate_mask == 0xd800) {
+        // High surrogate with nothing after it (dangling surrogate half).
+        if (input.len == 1)
+            return .{
+                .len = 1,
+            };
+
+        const c1 = @as(u21, input[1]);
+        if (c1 & surrogate_mask != 0xdc00) {
+            // Expected a low surrogate here. `input.len > 1` is already
+            // guaranteed by the early return above, so only the high
+            // surrogate is consumed and it is replaced.
+            return .{
+                .fail = true,
+                .len = 1,
+                .code_point = unicode_replacement,
+            };
+        }
+
+        // Valid pair: 10 payload bits from each half, offset by 0x10000.
+        return .{ .len = 2, .code_point = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)) };
+    } else if (c0 & surrogate_mask == 0xdc00) {
+        // Low surrogate with no preceding high surrogate.
+        return .{ .fail = true, .len = 1, .code_point = unicode_replacement };
+    } else {
+        return .{ .code_point = c0, .len = 1 };
+    }
+}
+
pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement {
const c0 = @as(u21, input[0]);
@@ -2576,16 +2610,19 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
var utf16_remaining = utf16;
var ended_on_non_ascii = false;
- if (comptime Type == []const u16) {
- if (bun.FeatureFlags.use_simdutf) {
- log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len });
-
- if (remaining.len >= out_len) {
- const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(trimmed, remaining[0..out_len]);
- return EncodeIntoResult{
- .read = @truncate(u32, trimmed.len),
- .written = @truncate(u32, result.count),
- };
+ brk: {
+ if (comptime Type == []const u16) {
+ if (bun.FeatureFlags.use_simdutf) {
+ log("UTF16 {d} -> UTF8 {d}", .{ utf16.len, out_len });
+ if (remaining.len >= out_len) {
+ const result = bun.simdutf.convert.utf16.to.utf8.with_errors.le(trimmed, remaining);
+ if (result.status == .surrogate) break :brk;
+
+ return EncodeIntoResult{
+ .read = @truncate(u32, trimmed.len),
+ .written = @truncate(u32, result.count),
+ };
+ }
}
}
}
@@ -2599,7 +2636,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
if (@min(utf16_remaining.len, remaining.len) == 0)
break;
- const replacement = utf16Codepoint(Type, utf16_remaining);
+ const replacement = utf16CodepointWithFFFD(Type, utf16_remaining);
const width: usize = replacement.utf8Width();
if (width > remaining.len) {
@@ -2642,7 +2679,7 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
3 => {
remaining[0] = @truncate(u8, 0xF0 | (replacement.code_point >> 18));
remaining[1] = @truncate(u8, 0x80 | (replacement.code_point >> 12) & 0x3F);
- remaining[3] = @truncate(u8, 0x80 | (replacement.code_point >> 0) & 0x3F);
+ remaining[2] = @truncate(u8, 0x80 | (replacement.code_point >> 6) & 0x3F);
remaining = remaining[remaining.len..];
},
else => {},