aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-02-01 18:48:09 -0800
committerGravatar GitHub <noreply@github.com> 2023-02-01 18:48:09 -0800
commit3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99 (patch)
treeb775f00e684da35bb58e1f4d951df88e6e0dc733
parent76f3c9c07b1db01ec4d0ae5361f0b1a1030ae528 (diff)
downloadbun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.gz
bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.tar.zst
bun-3c23f9ad5787bc9e3bd61b7df4c0cdb0fb9f7b99.zip
fix text encoding for utf8 (#1967)
* use character * replacement character * also test encoding decoded points * increase length by 1
-rw-r--r--src/bun.js/webcore/encoding.zig14
-rw-r--r--src/string_immutable.zig4
-rw-r--r--test/bun.js/text-encoder.test.js64
-rw-r--r--test/bun.js/utf8-encoding-fixture.txtbin0 -> 4456448 bytes
4 files changed, 76 insertions, 6 deletions
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index 9725073a2..6729cc4de 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -99,6 +99,13 @@ pub const TextEncoder = struct {
// max utf16 -> utf8 length
if (slice.len <= buf.len / 4) {
const result = strings.copyUTF16IntoUTF8(&buf, @TypeOf(slice), slice);
+ if (result.read == 0 or result.written == 0) {
+ const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
+ const array_buffer = uint8array.asArrayBuffer(globalThis).?;
+ const replacement_char = [_]u8{ 239, 191, 189 };
+ @memcpy(array_buffer.slice().ptr, &replacement_char, replacement_char.len);
+ return uint8array;
+ }
const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
std.debug.assert(result.written <= buf.len);
std.debug.assert(result.read == slice.len);
@@ -214,8 +221,11 @@ pub const TextEncoder = struct {
) u64 {
var output = buf_ptr[0..buf_len];
const input = input_ptr[0..input_len];
- const result: strings.EncodeIntoResult =
- strings.copyUTF16IntoUTF8(output, []const u16, input);
+ const result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input);
+ if (result.read == 0 or result.written == 0) {
+ const replacement_char = [_]u8{ 239, 191, 189 };
+ @memcpy(buf_ptr, &replacement_char, replacement_char.len);
+ }
const sized: [2]u32 = .{ result.read, result.written };
return @bitCast(u64, sized);
}
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 8cc2ab7b9..cf6f6126c 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -2481,9 +2481,8 @@ const latin1_to_utf16_conversion_table = [256]u16{
};
pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 {
- const as_utf16 = latin1ToCodepointBytesAssumeNotASCII16(char);
var bytes = [4]u8{ 0, 0, 0, 0 };
- _ = encodeWTF8Rune(&bytes, @intCast(i32, as_utf16));
+ _ = encodeWTF8Rune(&bytes, @intCast(i32, char));
return bytes[0..2].*;
}
@@ -2567,7 +2566,6 @@ pub fn copyUTF16IntoUTF8WithBuffer(buf: []u8, comptime Type: type, utf16: Type,
},
else => {},
}
-
},
4 => {
//only 1 to 3 written
diff --git a/test/bun.js/text-encoder.test.js b/test/bun.js/text-encoder.test.js
index 3f1d2acee..1afc3c9d5 100644
--- a/test/bun.js/text-encoder.test.js
+++ b/test/bun.js/text-encoder.test.js
@@ -52,7 +52,7 @@ describe("TextEncoder", () => {
expect(into2).toEqual(repeatedResult);
});
- it("should encode latin1 text", () => {
+ it("should encode latin1 text", async () => {
gcTrace(true);
const text = "Hello World!";
const encoder = new TextEncoder();
@@ -66,6 +66,68 @@ describe("TextEncoder", () => {
for (let i = 0; i < result.length; i++) {
expect(encoded[i]).toBe(result[i]);
}
+
+ let t = [
+ {
+ str: "\u009c\u0097",
+ expected: [194, 156, 194, 151],
+ },
+ {
+ str: "世",
+ expected: [228, 184, 150],
+ },
+ // Less than 0, out of range.
+ {
+ str: -1,
+ expected: [45, 49],
+ },
+ // Greater than 0x10FFFF, out of range.
+ {
+ str: 0x110000,
+ expected: [49, 49, 49, 52, 49, 49, 50],
+ },
+ // The Unicode replacement character.
+ {
+ str: "\uFFFD",
+ expected: [239, 191, 189],
+ },
+ ];
+ for (let { str, expected } of t) {
+ let utf8 = new TextEncoder().encode(str);
+ expect([...utf8]).toEqual(expected);
+ }
+
+ expect([...new TextEncoder().encode(String.fromCodePoint(0))]).toEqual([0]);
+
+ const fixture = new Uint8Array(await Bun.file("utf8-encoding-fixture.txt").arrayBuffer());
+ const length = 0x110000;
+ let textEncoder = new TextEncoder();
+ let textDecoder = new TextDecoder();
+ let encodeOut = new Uint8Array(length * 4);
+ let encodeIntoOut = new Uint8Array(length * 4);
+ let encodeIntoBuffer = new Uint8Array(4);
+ let encodeDecodedOut = new Uint8Array(length * 4);
+ for (let i = 0, offset = 0; i < length; i++, offset += 4) {
+ const s = String.fromCodePoint(i);
+ const u = textEncoder.encode(s);
+ encodeOut.set(u, offset);
+
+ textEncoder.encodeInto(s, encodeIntoBuffer);
+ encodeIntoOut.set(encodeIntoBuffer, offset);
+
+ const decoded = textDecoder.decode(encodeIntoBuffer);
+ const encoded = textEncoder.encode(decoded);
+ encodeDecodedOut.set(encoded, offset);
+ }
+
+ expect(encodeOut).toEqual(fixture);
+ expect(encodeIntoOut).toEqual(fixture);
+ expect(encodeOut).toEqual(encodeIntoOut);
+ expect(encodeDecodedOut).toEqual(encodeOut);
+ expect(encodeDecodedOut).toEqual(encodeIntoOut);
+ expect(encodeDecodedOut).toEqual(fixture);
+
+ expect(() => textEncoder.encode(String.fromCodePoint(length + 1))).toThrow();
});
it("should encode long latin1 text", async () => {
diff --git a/test/bun.js/utf8-encoding-fixture.txt b/test/bun.js/utf8-encoding-fixture.txt
new file mode 100644
index 000000000..1f9ecf34f
--- /dev/null
+++ b/test/bun.js/utf8-encoding-fixture.txt
Binary files differ