Cleanup some of the encoding code

author: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com> 2022-11-27 22:09:56 -0800
committer: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com> 2022-11-27 22:09:56 -0800
commit: 4ee11d922cb9fb4b97ae07028c409ffba153a1d4 (patch)
tree: 014ece7037771eefacbf4cf3bf7df76dce0988db
parent: 885049831f9c09f9e41e80d66062d0b5d2525257 (diff)
download: bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.tar.gz
bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.tar.zst
bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.zip
3 files changed, 51 insertions, 35 deletions
diff --git a/src/bun.js/bindings/bun-simdutf.zig b/src/bun.js/bindings/bun-simdutf.zig
index f84ce56ce..531e9c3ef 100644
--- a/src/bun.js/bindings/bun-simdutf.zig
+++ b/src/bun.js/bindings/bun-simdutf.zig
@@ -292,48 +292,52 @@ pub const length = struct {
 
 pub const trim = struct {
     pub fn utf8_len(buf: []const u8) usize {
-        if (buf.len < 3) {
-            switch (buf.len) {
+        const len = buf.len;
+
+        if (len < 3) {
+            switch (len) {
                 2 => {
-                    if (buf[buf.len - 1] >= 0b11000000) {
-                        return buf.len - 1;
+                    if (buf[len - 1] >= 0b11000000) {
+                        return len - 1;
                     } // 2-, 3- and 4-byte characters with only 1 byte left
-                    if (buf[buf.len - 2] >= 0b11100000) {
-                        return buf.len - 2;
+                    if (buf[len - 2] >= 0b11100000) {
+                        return len - 2;
                     } // 3- and 4-byte characters with only 2 bytes left
-                    return buf.len;
+                    return len;
                 },
                 1 => {
-                    if (buf[buf.len - 1] >= 0b11000000) {
-                        return buf.len - 1;
+                    if (buf[len - 1] >= 0b11000000) {
+                        return len - 1;
                     } // 2-, 3- and 4-byte characters with only 1 byte left
-                    return buf.len;
+                    return len;
                 },
-                0 => return buf.len,
+                0 => return len,
                 else => unreachable,
             }
         }
 
-        if (buf[buf.len - 1] >= 0b11000000) {
-            return buf.len - 1;
+        if (buf[len - 1] >= 0b11000000) {
+            return len - 1;
         } // 2-, 3- and 4-byte characters with only 1 byte left
-        if (buf[buf.len - 2] >= 0b11100000) {
-            return buf.len - 2;
+        if (buf[len - 2] >= 0b11100000) {
+            return len - 2;
         } // 3- and 4-byte characters with only 2 bytes left
-        if (buf[buf.len - 3] >= 0b11110000) {
-            return buf.len - 3;
+        if (buf[len - 3] >= 0b11110000) {
+            return len - 3;
         } // 4-byte characters with only 3 bytes left
-        return buf.len;
+        return len;
     }
 
     pub fn utf16_len(buf: []const u16) usize {
-        if (buf.len == 0) {
+        const len = buf.len;
+
+        if (len == 0) {
             return 0;
         }
-        if ((buf[buf.len - 1] >= 0xD800) and (buf[buf.len - 1] <= 0xDBFF)) {
-            return buf.len - 1;
+        if ((buf[len - 1] >= 0xD800) and (buf[len - 1] <= 0xDBFF)) {
+            return len - 1;
         }
-        return buf.len;
+        return len;
     }
 
     pub fn utf16(buf: []const u16) []const u16 {
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index 99b304bda..297f3cc0a 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -875,7 +875,6 @@ pub const Encoder = struct {
                 return ZigString.init(to).toExternalValue(global);
             },
             .buffer, .utf8 => {
-                // JSC only supports UTF-16 strings for non-ascii text
                 const converted = strings.toUTF16Alloc(allocator, input, false) catch return ZigString.init("Out of memory").toErrorInstance(global);
                 if (converted) |utf16| {
                     return ZigString.toExternalU16(utf16.ptr, utf16.len, global);
@@ -886,11 +885,11 @@ pub const Encoder = struct {
                 return ZigString.init(input).toValueGC(global);
             },
             .ucs2, .utf16le => {
-                var output = allocator.alloc(u16, len / 2) catch return ZigString.init("Out of memory").toErrorInstance(global);
-                var i: usize = 0;
-                while (i < len / 2) : (i += 1) {
-                    output[i] = (@intCast(u16, input[2 * i + 1]) << 8) + @intCast(u16, input[2 * i]);
-                }
+                var output = allocator.alloc(u16, @maximum(len / 2, 1)) catch return ZigString.init("Out of memory").toErrorInstance(global);
+                var output_bytes = std.mem.sliceAsBytes(output);
+                output_bytes[output_bytes.len - 1] = 0;
+
+                @memcpy(output_bytes.ptr, input_ptr, output_bytes.len);
                 return ZigString.toExternalU16(output.ptr, output.len, global);
             },
 
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 1eb46ee96..3b6915201 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -936,19 +936,26 @@ const strings = @This();
 /// This is intended to be used for strings that go to JavaScript
 pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
     var first_non_ascii: ?u32 = null;
+    var output_: ?std.ArrayList(u16) = null;
 
     if (bun.FeatureFlags.use_simdutf) {
         if (bytes.len == 0)
             return &[_]u16{};
 
-        if (bun.simdutf.validate.ascii(bytes))
+        const validated = bun.simdutf.validate.with_errors.ascii(bytes);
+        if (validated.status == .success)
             return null;
 
-        const trimmed = bun.simdutf.trim.utf8(bytes);
+        const offset = @truncate(u32, validated.count);
+
+        const trimmed = bun.simdutf.trim.utf8(bytes[offset..]);
         const out_length = bun.simdutf.length.utf16.from.utf8.le(trimmed);
-        var out = try allocator.alloc(u16, out_length);
+        var out = try allocator.alloc(u16, out_length + offset);
+
+        if (offset > 0)
+            strings.copyU8IntoU16(out[0..offset], bytes[0..offset]);
 
-        const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out);
+        const result = bun.simdutf.convert.utf8.to.utf16.with_errors.le(trimmed, out[offset..]);
         switch (result.status) {
             .success => {
                 return out;
@@ -959,7 +966,12 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
                     return error.InvalidByteSequence;
                 }
 
-                first_non_ascii = @truncate(u32, result.count);
+                first_non_ascii = @truncate(u32, result.count) + offset;
+                output_ = std.ArrayList(u16){
+                    .items = out[0..first_non_ascii.?],
+                    .capacity = out.len,
+                    .allocator = allocator,
+                };
             },
         }
     }
@@ -967,10 +979,11 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
     if (first_non_ascii orelse strings.firstNonASCII(bytes)) |i| {
         const ascii = bytes[0..i];
         const chunk = bytes[i..];
-        var output = try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2);
+        var output = output_ orelse try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2);
         errdefer output.deinit();
         output.items.len = ascii.len;
-        strings.copyU8IntoU16(output.items, ascii);
+        if (first_non_ascii == null)
+            strings.copyU8IntoU16(output.items, ascii);
 
         var remaining = chunk;
author	Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>	2022-11-27 22:09:56 -0800
committer	Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com>	2022-11-27 22:09:56 -0800
commit	4ee11d922cb9fb4b97ae07028c409ffba153a1d4 (patch)
tree	014ece7037771eefacbf4cf3bf7df76dce0988db
parent	885049831f9c09f9e41e80d66062d0b5d2525257 (diff)
download	bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.tar.gz bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.tar.zst bun-4ee11d922cb9fb4b97ae07028c409ffba153a1d4.zip