diff options
| author | 2023-02-01 18:48:09 -0800 | |
|---|---|---|
| committer | 2023-02-01 18:48:09 -0800 | |
| commit | 3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99 (patch) | |
| tree | b775f00e684da35bb58e1f4d951df88e6e0dc733 | |
| parent | 76f3c9c07b1db01ec4d0ae5361f0b1a1030ae528 (diff) | |
| download | bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.gz bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.zst bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.zip | |
fix text encoding for utf8 (#1967)
* use character
* replacement character
* also test encoding decoded points
* increase length by 1
| -rw-r--r-- | src/bun.js/webcore/encoding.zig | 14 | ||||
| -rw-r--r-- | src/string_immutable.zig | 4 | ||||
| -rw-r--r-- | test/bun.js/text-encoder.test.js | 64 | ||||
| -rw-r--r-- | test/bun.js/utf8-encoding-fixture.txt | bin | 0 -> 4456448 bytes | 
4 files changed, 76 insertions, 6 deletions
| diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 9725073a2..6729cc4de 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -99,6 +99,13 @@ pub const TextEncoder = struct {          // max utf16 -> utf8 length          if (slice.len <= buf.len / 4) {              const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice); +            if (result.read == 0 or result.written == 0) { +                const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); +                const array_buffer = uint8array.asArrayBuffer(globalThis).?; +                const replacement_char = [_]u8{ 239, 191, 189 }; +                @memcpy(array_buffer.slice().ptr, &replacement_char, replacement_char.len); +                return uint8array; +            }              const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);              std.debug.assert(result.written <= buf.len);              std.debug.assert(result.read == slice.len); @@ -214,8 +221,11 @@ pub const TextEncoder = struct {      ) u64 {          var output = buf_ptr[0..buf_len];          const input = input_ptr[0..input_len]; -        const result: strings.EncodeIntoResult = -            strings.copyUTF16IntoUTF8(output, []const u16, input); +        const result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input); +        if (result.read == 0 or result.written == 0) { +            const replacement_char = [_]u8{ 239, 191, 189 }; +            @memcpy(buf_ptr, &replacement_char, replacement_char.len); +        }          const sized: [2]u32 = .{ result.read, result.written };          return @bitCast(u64, sized);      } diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 8cc2ab7b9..cf6f6126c 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -2481,9 +2481,8 @@ const latin1_to_utf16_conversion_table = [256]u16{  };  pub 
fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 { -    const as_utf16 = latin1ToCodepointBytesAssumeNotASCII16(char);      var bytes = [4]u8{ 0, 0, 0, 0 }; -    _ = encodeWTF8Rune(&bytes, @intCast(i32, as_utf16)); +    _ = encodeWTF8Rune(&bytes, @intCast(i32, char));      return bytes[0..2].*;  } @@ -2567,7 +2566,6 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,                          },                          else => {},                      } -                                      },                  4 => {                      //only 1 to 3 written diff --git a/test/bun.js/text-encoder.test.js b/test/bun.js/text-encoder.test.js index 3f1d2acee..1afc3c9d5 100644 --- a/test/bun.js/text-encoder.test.js +++ b/test/bun.js/text-encoder.test.js @@ -52,7 +52,7 @@ describe("TextEncoder", () => {      expect(into2).toEqual(repeatedResult);    }); -  it("should encode latin1 text", () => { +  it("should encode latin1 text", async () => {      gcTrace(true);      const text = "Hello World!";      const encoder = new TextEncoder(); @@ -66,6 +66,68 @@ describe("TextEncoder", () => {      for (let i = 0; i < result.length; i++) {        expect(encoded[i]).toBe(result[i]);      } + +    let t = [ +      { +        str: "\u009c\u0097", +        expected: [194, 156, 194, 151], +      }, +      { +        str: "世", +        expected: [228, 184, 150], +      }, +      // Less than 0, out of range. +      { +        str: -1, +        expected: [45, 49], +      }, +      // Greater than 0x10FFFF, out of range. +      { +        str: 0x110000, +        expected: [49, 49, 49, 52, 49, 49, 50], +      }, +      // The Unicode replacement character. 
+      { +        str: "\uFFFD", +        expected: [239, 191, 189], +      }, +    ]; +    for (let { str, expected } of t) { +      let utf8 = new TextEncoder().encode(str); +      expect([...utf8]).toEqual(expected); +    } + +    expect([...new TextEncoder().encode(String.fromCodePoint(0))]).toEqual([0]); + +    const fixture = new Uint8Array(await Bun.file("utf8-encoding-fixture.txt").arrayBuffer()); +    const length = 0x110000; +    let textEncoder = new TextEncoder(); +    let textDecoder = new TextDecoder(); +    let encodeOut = new Uint8Array(length * 4); +    let encodeIntoOut = new Uint8Array(length * 4); +    let encodeIntoBuffer = new Uint8Array(4); +    let encodeDecodedOut = new Uint8Array(length * 4); +    for (let i = 0, offset = 0; i < length; i++, offset += 4) { +      const s = String.fromCodePoint(i); +      const u = textEncoder.encode(s); +      encodeOut.set(u, offset); + +      textEncoder.encodeInto(s, encodeIntoBuffer); +      encodeIntoOut.set(encodeIntoBuffer, offset); + +      const decoded = textDecoder.decode(encodeIntoBuffer); +      const encoded = textEncoder.encode(decoded); +      encodeDecodedOut.set(encoded, offset); +    } + +    expect(encodeOut).toEqual(fixture); +    expect(encodeIntoOut).toEqual(fixture); +    expect(encodeOut).toEqual(encodeIntoOut); +    expect(encodeDecodedOut).toEqual(encodeOut); +    expect(encodeDecodedOut).toEqual(encodeIntoOut); +    expect(encodeDecodedOut).toEqual(fixture); + +    expect(() => textEncoder.encode(String.fromCodePoint(length + 1))).toThrow();    });    it("should encode long latin1 text", async () => { diff --git a/test/bun.js/utf8-encoding-fixture.txt b/test/bun.js/utf8-encoding-fixture.txtBinary files differ new file mode 100644 index 000000000..1f9ecf34f --- /dev/null +++ b/test/bun.js/utf8-encoding-fixture.txt | 
