fix text encoding for utf8 (#1967)

* use character
* replacement character
* also test encoding decoded points
* increase length by 1
author: Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-02-01 18:48:09 -0800
committer: GitHub <noreply@github.com> 2023-02-01 18:48:09 -0800
commit: 3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99 (patch)
tree: b775f00e684da35bb58e1f4d951df88e6e0dc733
parent: 76f3c9c07b1db01ec4d0ae5361f0b1a1030ae528 (diff)
download: bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.gz
bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.zst
bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.zip
4 files changed, 76 insertions, 6 deletions
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index 9725073a2..6729cc4de 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -99,6 +99,13 @@ pub const TextEncoder = struct {
         // max utf16 -> utf8 length
         if (slice.len <= buf.len / 4) {
             const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice);
+            if (result.read == 0 or result.written == 0) {
+                const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
+                const array_buffer = uint8array.asArrayBuffer(globalThis).?;
+                const replacement_char = [_]u8{ 239, 191, 189 };
+                @memcpy(array_buffer.slice().ptr, &replacement_char, replacement_char.len);
+                return uint8array;
+            }
             const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
             std.debug.assert(result.written <= buf.len);
             std.debug.assert(result.read == slice.len);
@@ -214,8 +221,11 @@ pub const TextEncoder = struct {
     ) u64 {
         var output = buf_ptr[0..buf_len];
         const input = input_ptr[0..input_len];
-        const result: strings.EncodeIntoResult =
-            strings.copyUTF16IntoUTF8(output, []const u16, input);
+        const result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input);
+        if (result.read == 0 or result.written == 0) {
+            const replacement_char = [_]u8{ 239, 191, 189 };
+            @memcpy(buf_ptr, &replacement_char, replacement_char.len);
+        }
         const sized: [2]u32 = .{ result.read, result.written };
         return @bitCast(u64, sized);
     }
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 8cc2ab7b9..cf6f6126c 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -2481,9 +2481,8 @@ const latin1_to_utf16_conversion_table = [256]u16{
 };
 
 pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
-    const as_utf16 = latin1ToCodepointBytesAssumeNotASCII16(char);
     var bytes = [4]u8{ 0, 0, 0, 0 };
-    _ = encodeWTF8Rune(&bytes, @intCast(i32, as_utf16));
+    _ = encodeWTF8Rune(&bytes, @intCast(i32, char));
     return bytes[0..2].*;
 }
 
@@ -2567,7 +2566,6 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
                         },
                         else => {},
                     }
-                    
                 },
                 4 => {
                     //only 1 to 3 written
diff --git a/test/bun.js/text-encoder.test.js b/test/bun.js/text-encoder.test.js
index 3f1d2acee..1afc3c9d5 100644
--- a/test/bun.js/text-encoder.test.js
+++ b/test/bun.js/text-encoder.test.js
@@ -52,7 +52,7 @@ describe("TextEncoder", () => {
     expect(into2).toEqual(repeatedResult);
   });
 
-  it("should encode latin1 text", () => {
+  it("should encode latin1 text", async () => {
     gcTrace(true);
     const text = "Hello World!";
     const encoder = new TextEncoder();
@@ -66,6 +66,68 @@ describe("TextEncoder", () => {
     for (let i = 0; i < result.length; i++) {
       expect(encoded[i]).toBe(result[i]);
     }
+
+    let t = [
+      {
+        str: "\u009c\u0097",
+        expected: [194, 156, 194, 151],
+      },
+      {
+        str: "世",
+        expected: [228, 184, 150],
+      },
+      // Less than 0, out of range.
+      {
+        str: -1,
+        expected: [45, 49],
+      },
+      // Greater than 0x10FFFF, out of range.
+      {
+        str: 0x110000,
+        expected: [49, 49, 49, 52, 49, 49, 50],
+      },
+      // The Unicode replacement character.
+      {
+        str: "\uFFFD",
+        expected: [239, 191, 189],
+      },
+    ];
+    for (let { str, expected } of t) {
+      let utf8 = new TextEncoder().encode(str);
+      expect([...utf8]).toEqual(expected);
+    }
+
+    expect([...new TextEncoder().encode(String.fromCodePoint(0))]).toEqual([0]);
+
+    const fixture = new Uint8Array(await Bun.file("utf8-encoding-fixture.txt").arrayBuffer());
+    const length = 0x110000;
+    let textEncoder = new TextEncoder();
+    let textDecoder = new TextDecoder();
+    let encodeOut = new Uint8Array(length * 4);
+    let encodeIntoOut = new Uint8Array(length * 4);
+    let encodeIntoBuffer = new Uint8Array(4);
+    let encodeDecodedOut = new Uint8Array(length * 4);
+    for (let i = 0, offset = 0; i < length; i++, offset += 4) {
+      const s = String.fromCodePoint(i);
+      const u = textEncoder.encode(s);
+      encodeOut.set(u, offset);
+
+      textEncoder.encodeInto(s, encodeIntoBuffer);
+      encodeIntoOut.set(encodeIntoBuffer, offset);
+
+      const decoded = textDecoder.decode(encodeIntoBuffer);
+      const encoded = textEncoder.encode(decoded);
+      encodeDecodedOut.set(encoded, offset);
+    }
+
+    expect(encodeOut).toEqual(fixture);
+    expect(encodeIntoOut).toEqual(fixture);
+    expect(encodeOut).toEqual(encodeIntoOut);
+    expect(encodeDecodedOut).toEqual(encodeOut);
+    expect(encodeDecodedOut).toEqual(encodeIntoOut);
+    expect(encodeDecodedOut).toEqual(fixture);
+
+    expect(() => textEncoder.encode(String.fromCodePoint(length + 1))).toThrow();
   });
 
   it("should encode long latin1 text", async () => {
diff --git a/test/bun.js/utf8-encoding-fixture.txt b/test/bun.js/utf8-encoding-fixture.txt
new file mode 100644
index 000000000..1f9ecf34f
--- /dev/null
+++ b/test/bun.js/utf8-encoding-fixture.txt
author	Dylan Conway <35280289+dylan-conway@users.noreply.github.com>	2023-02-01 18:48:09 -0800
committer	GitHub <noreply@github.com>	2023-02-01 18:48:09 -0800
commit	3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99 (patch)
tree	b775f00e684da35bb58e1f4d951df88e6e0dc733
parent	76f3c9c07b1db01ec4d0ae5361f0b1a1030ae528 (diff)
download	bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.gz bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.zst bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.zip