diff options
author | 2022-11-27 22:09:56 -0800 | |
---|---|---|
committer | 2022-11-27 22:09:56 -0800 | |
commit | 4ee11d922cb9fb4b97ae07028c409ffba153a1d4 (patch) | |
tree | 014ece7037771eefacbf4cf3bf7df76dce0988db | |
parent | 885049831f9c09f9e41e80d66062d0b5d2525257 (diff) | |
download | bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.tar.gz bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.tar.zst bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.zip |
Cleanup some of the encoding code
-rw-r--r-- | src/bun.js/bindings/bun-simdutf.zig | 48 | ||||
-rw-r--r-- | src/bun.js/webcore/encoding.zig | 11 | ||||
-rw-r--r-- | src/string_immutable.zig | 27 |
3 files changed, 51 insertions, 35 deletions
diff --git a/src/bun.js/bindings/bun-simdutf.zig b/src/bun.js/bindings/bun-simdutf.zig index f84ce56ce..531e9c3ef 100644 --- a/src/bun.js/bindings/bun-simdutf.zig +++ b/src/bun.js/bindings/bun-simdutf.zig @@ -292,48 +292,52 @@ pub const length = struct { pub const trim = struct { pub fn utf8_len(buf: []const u8) usize { - if (buf.len < 3) { - switch (buf.len) { + const len = buf.len; + + if (len < 3) { + switch (len) { 2 => { - if (buf[buf.len - 1] >= 0b11000000) { - return buf.len - 1; + if (buf[len - 1] >= 0b11000000) { + return len - 1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[buf.len - 2] >= 0b11100000) { - return buf.len - 2; + if (buf[len - 2] >= 0b11100000) { + return len - 2; } // 3- and 4-byte characters with only 2 bytes left - return buf.len; + return len; }, 1 => { - if (buf[buf.len - 1] >= 0b11000000) { - return buf.len - 1; + if (buf[len - 1] >= 0b11000000) { + return len - 1; } // 2-, 3- and 4-byte characters with only 1 byte left - return buf.len; + return len; }, - 0 => return buf.len, + 0 => return len, else => unreachable, } } - if (buf[buf.len - 1] >= 0b11000000) { - return buf.len - 1; + if (buf[len - 1] >= 0b11000000) { + return len - 1; } // 2-, 3- and 4-byte characters with only 1 byte left - if (buf[buf.len - 2] >= 0b11100000) { - return buf.len - 2; + if (buf[len - 2] >= 0b11100000) { + return len - 2; } // 3- and 4-byte characters with only 1 byte left - if (buf[buf.len - 3] >= 0b11110000) { - return buf.len - 3; + if (buf[len - 3] >= 0b11110000) { + return len - 3; } // 4-byte characters with only 3 bytes left - return buf.len; + return len; } pub fn utf16_len(buf: []const u16) usize { - if (buf.len == 0) { + const len = buf.len; + + if (len == 0) { return 0; } - if ((buf[buf.len - 1] >= 0xD800) and (buf[buf.len - 1] <= 0xDBFF)) { - return buf.len - 1; + if ((buf[len - 1] >= 0xD800) and (buf[len - 1] <= 0xDBFF)) { + return len - 1; } - return buf.len; + return len; } pub fn utf16(buf: []const u16) []const u16 { diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 99b304bda..297f3cc0a 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -875,7 +875,6 @@ pub const Encoder = struct { return ZigString.init(to).toExternalValue(global); }, .buffer, .utf8 => { - // JSC only supports UTF-16 strings for non-ascii text const converted = strings.toUTF16Alloc(allocator, input, false) catch return ZigString.init("Out of memory").toErrorInstance(global); if (converted) |utf16| { return ZigString.toExternalU16(utf16.ptr, utf16.len, global); @@ -886,11 +885,11 @@ pub const Encoder = struct { return ZigString.init(input).toValueGC(global); }, .ucs2, .utf16le => { - var output = allocator.alloc(u16, len / 2) catch return ZigString.init("Out of memory").toErrorInstance(global); - var i: usize = 0; - while (i < len / 2) : (i += 1) { - output[i] = (@intCast(u16, input[2 * i + 1]) << 8) + @intCast(u16, input[2 * i]); - } + var output = allocator.alloc(u16, @maximum(len / 2, 1)) catch return ZigString.init("Out of memory").toErrorInstance(global); + var output_bytes = std.mem.sliceAsBytes(output); + output_bytes[output_bytes.len - 1] = 0; + + @memcpy(output_bytes.ptr, input_ptr, output_bytes.len); return ZigString.toExternalU16(output.ptr, output.len, global); }, diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 1eb46ee96..3b6915201 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -936,19 +936,26 @@ const strings = @This(); /// This is intended to be used for strings that go to JavaScript pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 { var first_non_ascii: ?u32 = null; + var output_: ?std.ArrayList(u16) = null; if (bun.FeatureFlags.use_simdutf) { if (bytes.len == 0) return &[_]u16{}; - if (bun.simdutf.validate.ascii(bytes)) + const validated = bun.simdutf.validate.with_errors.ascii(bytes); + if (validated.status == .success) return null; - const trimmed = bun.simdutf.trim.utf8(bytes); + const offset = @truncate(u32, validated.count); + + const trimmed = bun.simdutf.trim.utf8(bytes[offset..]); const out_length = bun.simdutf.length.utf16.from.utf8.le(trimmed); - var out = try allocator.alloc(u16, out_length); + var out = try allocator.alloc(u16, out_length + offset); + + if (offset > 0) + strings.copyU8IntoU16(out[0..offset], bytes[0..offset]); - const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out); + const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out[offset..]); switch (result.status) { .success => { return out; @@ -959,7 +966,12 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa return error.InvalidByteSequence; } - first_non_ascii = @truncate(u32, result.count); + first_non_ascii = @truncate(u32, result.count) + offset; + output_ = std.ArrayList(u16){ + .items = out[0..first_non_ascii.?], + .capacity = out.len, + .allocator = allocator, + }; }, } } @@ -967,10 +979,11 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa if (first_non_ascii orelse strings.firstNonASCII(bytes)) |i| { const ascii = bytes[0..i]; const chunk = bytes[i..]; - var output = try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2); + var output = output_ orelse try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2); errdefer output.deinit(); output.items.len = ascii.len; - strings.copyU8IntoU16(output.items, ascii); + if (first_non_ascii == null) + strings.copyU8IntoU16(output.items, ascii); var remaining = chunk; |