diff options
-rw-r--r-- | src/bun.js/webcore/encoding.zig | 14 | ||||
-rw-r--r-- | src/string_immutable.zig | 4 | ||||
-rw-r--r-- | test/bun.js/text-encoder.test.js | 64 | ||||
-rw-r--r-- | test/bun.js/utf8-encoding-fixture.txt | bin | 0 -> 4456448 bytes |
4 files changed, 76 insertions, 6 deletions
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 9725073a2..6729cc4de 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -99,6 +99,13 @@ pub const TextEncoder = struct { // max utf16 -> utf8 length if (slice.len <= buf.len / 4) { const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice); + if (result.read == 0 or result.written == 0) { + const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3); + const array_buffer = uint8array.asArrayBuffer(globalThis).?; + const replacement_char = [_]u8{ 239, 191, 189 }; + @memcpy(array_buffer.slice().ptr, &replacement_char, replacement_char.len); + return uint8array; + } const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written); std.debug.assert(result.written <= buf.len); std.debug.assert(result.read == slice.len); @@ -214,8 +221,11 @@ pub const TextEncoder = struct { ) u64 { var output = buf_ptr[0..buf_len]; const input = input_ptr[0..input_len]; - const result: strings.EncodeIntoResult = - strings.copyUTF16IntoUTF8(output, []const u16, input); + const result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input); + if (result.read == 0 or result.written == 0) { + const replacement_char = [_]u8{ 239, 191, 189 }; + @memcpy(buf_ptr, &replacement_char, replacement_char.len); + } const sized: [2]u32 = .{ result.read, result.written }; return @bitCast(u64, sized); } diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 8cc2ab7b9..cf6f6126c 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -2481,9 +2481,8 @@ const latin1_to_utf16_conversion_table = [256]u16{ }; pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 { - const as_utf16 = latin1ToCodepointBytesAssumeNotASCII16(char); var bytes = [4]u8{ 0, 0, 0, 0 }; - _ = encodeWTF8Rune(&bytes, @intCast(i32, as_utf16)); + _ = encodeWTF8Rune(&bytes, @intCast(i32, char)); return 
bytes[0..2].*; } @@ -2567,7 +2566,6 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type, }, else => {}, } - }, 4 => { //only 1 to 3 written diff --git a/test/bun.js/text-encoder.test.js b/test/bun.js/text-encoder.test.js index 3f1d2acee..1afc3c9d5 100644 --- a/test/bun.js/text-encoder.test.js +++ b/test/bun.js/text-encoder.test.js @@ -52,7 +52,7 @@ describe("TextEncoder", () => { expect(into2).toEqual(repeatedResult); }); - it("should encode latin1 text", () => { + it("should encode latin1 text", async () => { gcTrace(true); const text = "Hello World!"; const encoder = new TextEncoder(); @@ -66,6 +66,68 @@ describe("TextEncoder", () => { for (let i = 0; i < result.length; i++) { expect(encoded[i]).toBe(result[i]); } + + let t = [ + { + str: "\u009c\u0097", + expected: [194, 156, 194, 151], + }, + { + str: "世", + expected: [228, 184, 150], + }, + // Less than 0, out of range. + { + str: -1, + expected: [45, 49], + }, + // Greater than 0x10FFFF, out of range. + { + str: 0x110000, + expected: [49, 49, 49, 52, 49, 49, 50], + }, + // The Unicode replacement character. 
+ { + str: "\uFFFD", + expected: [239, 191, 189], + }, + ]; + for (let { str, expected } of t) { + let utf8 = new TextEncoder().encode(str); + expect([...utf8]).toEqual(expected); + } + + expect([...new TextEncoder().encode(String.fromCodePoint(0))]).toEqual([0]); + + const fixture = new Uint8Array(await Bun.file("utf8-encoding-fixture.txt").arrayBuffer()); + const length = 0x110000; + let textEncoder = new TextEncoder(); + let textDecoder = new TextDecoder(); + let encodeOut = new Uint8Array(length * 4); + let encodeIntoOut = new Uint8Array(length * 4); + let encodeIntoBuffer = new Uint8Array(4); + let encodeDecodedOut = new Uint8Array(length * 4); + for (let i = 0, offset = 0; i < length; i++, offset += 4) { + const s = String.fromCodePoint(i); + const u = textEncoder.encode(s); + encodeOut.set(u, offset); + + textEncoder.encodeInto(s, encodeIntoBuffer); + encodeIntoOut.set(encodeIntoBuffer, offset); + + const decoded = textDecoder.decode(encodeIntoBuffer); + const encoded = textEncoder.encode(decoded); + encodeDecodedOut.set(encoded, offset); + } + + expect(encodeOut).toEqual(fixture); + expect(encodeIntoOut).toEqual(fixture); + expect(encodeOut).toEqual(encodeIntoOut); + expect(encodeDecodedOut).toEqual(encodeOut); + expect(encodeDecodedOut).toEqual(encodeIntoOut); + expect(encodeDecodedOut).toEqual(fixture); + + expect(() => textEncoder.encode(String.fromCodePoint(length + 1))).toThrow(); }); it("should encode long latin1 text", async () => { diff --git a/test/bun.js/utf8-encoding-fixture.txt b/test/bun.js/utf8-encoding-fixture.txt Binary files differ new file mode 100644 index 000000000..1f9ecf34f --- /dev/null +++ b/test/bun.js/utf8-encoding-fixture.txt |