fix utf16le fill and utf8 partial write of utf16 (#1943)

author: Ciro Spaciari <ciro.spaciari@gmail.com> 2023-01-30 17:21:23 -0300
committer: GitHub <noreply@github.com> 2023-01-30 12:21:23 -0800
commit: aa10799d8a9a69b828e36cd9d295f6d5867fb511 (patch)
tree: fde30e5b530ce25acb417e766e96ee3710eedcc3
parent: ec2c16fefa8b98efaa1ccf84f18eea0a12c1c9ef (diff)
download: bun-aa10799d8a9a69b828e36cd9d295f6d5867fb511.tar.gz
bun-aa10799d8a9a69b828e36cd9d295f6d5867fb511.tar.zst
bun-aa10799d8a9a69b828e36cd9d295f6d5867fb511.zip
5 files changed, 88 insertions, 28 deletions
diff --git a/src/bun.js/bindings/JSBuffer.cpp b/src/bun.js/bindings/JSBuffer.cpp
index 8436e24e7..10002b664 100644
--- a/src/bun.js/bindings/JSBuffer.cpp
+++ b/src/bun.js/bindings/JSBuffer.cpp
@@ -466,7 +466,7 @@ static inline JSC::EncodedJSValue jsBufferByteLengthFromStringAndEncoding(JSC::J
     }
 
     if (str->length() == 0)
-        RELEASE_AND_RETURN(scope, JSC::JSValue::encode(JSC::jsNumber(-1)));
+        RELEASE_AND_RETURN(scope, JSC::JSValue::encode(JSC::jsNumber(0)));
 
     int64_t written = 0;
 
diff --git a/src/bun.js/node/buffer.zig b/src/bun.js/node/buffer.zig
index 5637e45b6..8ede45f5d 100644
--- a/src/bun.js/node/buffer.zig
+++ b/src/bun.js/node/buffer.zig
@@ -28,37 +28,37 @@ pub const BufferVectorized = struct {
 
         const written = switch (encoding) {
             JSC.Node.Encoding.utf8 => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.utf8)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.utf8, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.utf8),
             JSC.Node.Encoding.ascii => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.ascii)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.ascii, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.ascii),
             JSC.Node.Encoding.latin1 => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.latin1)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.latin1, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.latin1),
             JSC.Node.Encoding.buffer => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.buffer)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.buffer, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.buffer),
             JSC.Node.Encoding.utf16le,
             JSC.Node.Encoding.ucs2,
             => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.utf16le)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.utf16le, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.utf16le),
             JSC.Node.Encoding.base64 => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.base64)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.base64, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.base64),
             JSC.Node.Encoding.base64url => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.base64url)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.base64url, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.base64url),
             JSC.Node.Encoding.hex => if (str.is16Bit())
-                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.hex)
+                JSC.WebCore.Encoder.writeU16(str.utf16SliceAligned().ptr, str.utf16SliceAligned().len, buf.ptr, buf.len, JSC.Node.Encoding.hex, true)
             else
                 JSC.WebCore.Encoder.writeU8(str.slice().ptr, str.slice().len, buf.ptr, buf.len, JSC.Node.Encoding.hex),
         };
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index 59c3f3866..d0b4bdd9a 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -688,14 +688,14 @@ pub const Encoder = struct {
     }
     export fn Bun__encoding__writeUTF16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, encoding: u8) i64 {
         return switch (@intToEnum(JSC.Node.Encoding, encoding)) {
-            .utf8 => writeU16(input, len, to, to_len, .utf8),
-            .latin1 => writeU16(input, len, to, to_len, .ascii),
-            .ascii => writeU16(input, len, to, to_len, .ascii),
-            .ucs2 => writeU16(input, len, to, to_len, .utf16le),
-            .utf16le => writeU16(input, len, to, to_len, .utf16le),
-            .base64 => writeU16(input, len, to, to_len, .base64),
-            .base64url => writeU16(input, len, to, to_len, .base64url),
-            .hex => writeU16(input, len, to, to_len, .hex),
+            .utf8 => writeU16(input, len, to, to_len, .utf8, false),
+            .latin1 => writeU16(input, len, to, to_len, .ascii, false),
+            .ascii => writeU16(input, len, to, to_len, .ascii, false),
+            .ucs2 => writeU16(input, len, to, to_len, .utf16le, false),
+            .utf16le => writeU16(input, len, to, to_len, .utf16le, false),
+            .base64 => writeU16(input, len, to, to_len, .base64, false),
+            .base64url => writeU16(input, len, to, to_len, .base64url, false),
+            .hex => writeU16(input, len, to, to_len, .hex, false),
             else => unreachable,
         };
     }
@@ -882,6 +882,9 @@ pub const Encoder = struct {
             },
             // encode latin1 into UTF16
             JSC.Node.Encoding.ucs2, JSC.Node.Encoding.utf16le => {
+                Output.println("writeU8 ucs2/utf16  {any} {any}", .{ len, to_len});
+                Output.flush();
+
                 if (to_len < 2)
                     return 0;
 
@@ -954,7 +957,7 @@ pub const Encoder = struct {
         }
     }
 
-    pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, comptime encoding: JSC.Node.Encoding) i64 {
+    pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, comptime encoding: JSC.Node.Encoding, comptime allow_partial_write: bool) i64 {
         if (len == 0)
             return 0;
 
@@ -969,14 +972,23 @@ pub const Encoder = struct {
             },
             // string is already encoded, just need to copy the data
             JSC.Node.Encoding.ucs2, JSC.Node.Encoding.utf16le => {
-                const bytes_input_len = len * 2;
-                const written = @min(bytes_input_len, to_len);
-                if (written < 2) return 0;
-
-                const fixed_len = (written / 2) * 2;
-                const input_u8 = @ptrCast([*]const u8, input);
-                strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..fixed_len]);
-                return @intCast(i64, fixed_len);
+                if(allow_partial_write) {
+                    const bytes_input_len = len * 2;
+                    const written = @min(bytes_input_len, to_len);
+                    const input_u8 = @ptrCast([*]const u8, input);
+                    strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..written]);
+                    return @intCast(i64, written);
+                } else {
+                    const bytes_input_len = len * 2;
+                    const written = @min(bytes_input_len, to_len);
+                    if (written < 2) return 0;
+
+                    const fixed_len = (written / 2) * 2;
+                    const input_u8 = @ptrCast([*]const u8, input);
+                    strings.copyU16IntoU8(to[0..written], []const u8, input_u8[0..fixed_len]);
+                    return @intCast(i64, fixed_len);
+                }
+                
             },
 
             JSC.Node.Encoding.hex => {
diff --git a/src/napi/napi.zig b/src/napi/napi.zig
index a65143a09..046ad36af 100644
--- a/src/napi/napi.zig
+++ b/src/napi/napi.zig
@@ -340,7 +340,7 @@ pub export fn napi_get_value_string_latin1(env: napi_env, value: napi_value, buf
 
     if (zig_str.is16Bit()) {
         const utf16 = zig_str.utf16SliceAligned();
-        const wrote = JSC.WebCore.Encoder.writeU16(utf16.ptr, utf16.len, buf, buf_.len, .latin1);
+        const wrote = JSC.WebCore.Encoder.writeU16(utf16.ptr, utf16.len, buf, buf_.len, .latin1, false);
         if (wrote < 0) {
             return .generic_failure;
         }
@@ -404,7 +404,7 @@ pub export fn napi_get_value_string_utf8(env: napi_env, value: napi_value, buf_p
 
     if (zig_str.is16Bit()) {
         const utf16 = zig_str.utf16SliceAligned();
-        const wrote = JSC.WebCore.Encoder.writeU16(utf16.ptr, utf16.len, buf, buf_.len, .utf8);
+        const wrote = JSC.WebCore.Encoder.writeU16(utf16.ptr, utf16.len, buf, buf_.len, .utf8, false);
         if (wrote < 0) {
             return .generic_failure;
         }
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index e949892e1..8cc2ab7b9 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -2545,6 +2545,54 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
         const width: usize = replacement.utf8Width();
         if (width > remaining.len) {
             ended_on_non_ascii = width > 1;
+            switch (width) {
+                2 => {
+                    if (remaining.len > 0) {
+                        //only first will be written
+                        remaining[0] = @truncate(u8, 0xC0 | (replacement.code_point >> 6));
+                        remaining = remaining[remaining.len..];
+                    }
+                },
+                3 => {
+                    //only first to second written
+                    switch (remaining.len) {
+                        1 => {
+                            remaining[0] = @truncate(u8, 0xE0 | (replacement.code_point >> 12));
+                            remaining = remaining[remaining.len..];
+                        },
+                        2 => {
+                            remaining[0] = @truncate(u8, 0xE0 | (replacement.code_point >> 12));
+                            remaining[1] = @truncate(u8, 0x80 | (replacement.code_point >> 6) & 0x3F);
+                            remaining = remaining[remaining.len..];
+                        },
+                        else => {},
+                    }
+                    
+                },
+                4 => {
+                    //only 1 to 3 written
+                    switch (remaining.len) {
+                        1 => {
+                            remaining[0] = @truncate(u8, 0xF0 | (replacement.code_point >> 18));
+                            remaining = remaining[remaining.len..];
+                        },
+                        2 => {
+                            remaining[0] = @truncate(u8, 0xF0 | (replacement.code_point >> 18));
+                            remaining[1] = @truncate(u8, 0x80 | (replacement.code_point >> 12) & 0x3F);
+                            remaining = remaining[remaining.len..];
+                        },
+                        3 => {
+                            remaining[0] = @truncate(u8, 0xF0 | (replacement.code_point >> 18));
+                            remaining[1] = @truncate(u8, 0x80 | (replacement.code_point >> 12) & 0x3F);
+                            remaining[3] = @truncate(u8, 0x80 | (replacement.code_point >> 0) & 0x3F);
+                            remaining = remaining[remaining.len..];
+                        },
+                        else => {},
+                    }
+                },
+
+                else => {},
+            }
             break;
         }
author	Ciro Spaciari <ciro.spaciari@gmail.com>	2023-01-30 17:21:23 -0300
committer	GitHub <noreply@github.com>	2023-01-30 12:21:23 -0800
commit	aa10799d8a9a69b828e36cd9d295f6d5867fb511 (patch)
tree	fde30e5b530ce25acb417e766e96ee3710eedcc3
parent	ec2c16fefa8b98efaa1ccf84f18eea0a12c1c9ef (diff)
download	bun-aa10799d8a9a69b828e36cd9d295f6d5867fb511.tar.gz bun-aa10799d8a9a69b828e36cd9d295f6d5867fb511.tar.zst bun-aa10799d8a9a69b828e36cd9d295f6d5867fb511.zip