fix text decode trim (#4495)

* remove trim
* separate function
* a test
* trim when `stream` is true

---------

Co-authored-by: Jarred Sumner <jarred@jarredsumner.com>
author: Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-09-05 17:53:31 -0700
committer: GitHub <noreply@github.com> 2023-09-05 17:53:31 -0700
commit: 70a5cfe9087c9836cef06ee3560f6111ed2fe18e (patch)
tree: a8968f64234384cda482ae7e85beb7a9c3f4e7e6
parent: 1bd5b245b8a55353e60a2decad507ef8014be044 (diff)
download: bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.gz
bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.zst
bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.zip
3 files changed, 143 insertions, 5 deletions
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index 6f31fef82..53933fdb7 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -592,14 +592,22 @@ pub const TextDecoder = struct {
             return JSValue.zero;
         };
 
-        return this.decodeSlice(globalThis, array_buffer.slice());
+        if (arguments.len > 1 and arguments[1].isObject()) {
+            if (arguments[1].get(globalThis, "stream")) |stream| {
+                if (stream.toBoolean()) {
+                    return this.decodeSlice(globalThis, array_buffer.slice(), true);
+                }
+            }
+        }
+
+        return this.decodeSlice(globalThis, array_buffer.slice(), false);
     }
 
     pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) callconv(.C) JSValue {
-        return this.decodeSlice(globalThis, uint8array.slice());
+        return this.decodeSlice(globalThis, uint8array.slice(), false);
     }
 
-    fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8) JSValue {
+    fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime stream: bool) JSValue {
         switch (this.encoding) {
             EncodingLabel.latin1 => {
                 if (strings.isAllASCII(buffer_slice)) {
@@ -620,8 +628,9 @@ pub const TextDecoder = struct {
                 return ZigString.toExternalU16(bytes.ptr, out.written, globalThis);
             },
             EncodingLabel.@"UTF-8" => {
+                const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim;
                 if (this.fatal) {
-                    if (strings.toUTF16Alloc(default_allocator, buffer_slice, true)) |result_| {
+                    if (toUTF16(default_allocator, buffer_slice, true)) |result_| {
                         if (result_) |result| {
                             return ZigString.toExternalU16(result.ptr, result.len, globalThis);
                         }
@@ -640,7 +649,7 @@ pub const TextDecoder = struct {
                         }
                     }
                 } else {
-                    if (strings.toUTF16Alloc(default_allocator, buffer_slice, false)) |result_| {
+                    if (toUTF16(default_allocator, buffer_slice, false)) |result_| {
                         if (result_) |result| {
                             return ZigString.toExternalU16(result.ptr, result.len, globalThis);
                         }
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index c62266c62..d2d71621f 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1415,6 +1415,121 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
     return null;
 }
 
+pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 {
+    if (strings.firstNonASCII(bytes)) |i| {
+        const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: {
+            const out_length = bun.simdutf.length.utf16.from.utf8.le(bytes);
+
+            if (out_length == 0)
+                break :simd null;
+
+            var out = try allocator.alloc(u16, out_length);
+            log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length });
+
+            // avoid `.with_errors.le()` due to https://github.com/simdutf/simdutf/issues/213
+            switch (bun.simdutf.convert.utf8.to.utf16.le(bytes, out)) {
+                0 => {
+                    if (comptime fail_if_invalid) {
+                        allocator.free(out);
+                        return error.InvalidByteSequence;
+                    }
+
+                    break :simd .{
+                        .items = out[0..i],
+                        .capacity = out.len,
+                        .allocator = allocator,
+                    };
+                },
+                else => return out,
+            }
+        } else null;
+        var output = output_ orelse fallback: {
+            var list = try std.ArrayList(u16).initCapacity(allocator, i + 2);
+            list.items.len = i;
+            strings.copyU8IntoU16(list.items, bytes[0..i]);
+            break :fallback list;
+        };
+        errdefer output.deinit();
+
+        var remaining = bytes[i..];
+
+        {
+            const sequence: [4]u8 = switch (remaining.len) {
+                0 => unreachable,
+                1 => [_]u8{ remaining[0], 0, 0, 0 },
+                2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
+                3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
+                else => remaining[0..4].*,
+            };
+
+            const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+            if (comptime fail_if_invalid) {
+                if (replacement.fail) {
+                    if (comptime Environment.allow_assert) std.debug.assert(replacement.code_point == unicode_replacement);
+                    return error.InvalidByteSequence;
+                }
+            }
+            remaining = remaining[@max(replacement.len, 1)..];
+
+            //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+            switch (replacement.code_point) {
+                0...0xffff => |c| {
+                    try output.append(@as(u16, @intCast(c)));
+                },
+                else => |c| {
+                    try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
+                },
+            }
+        }
+
+        while (strings.firstNonASCII(remaining)) |j| {
+            const end = output.items.len;
+            try output.ensureUnusedCapacity(j);
+            output.items.len += j;
+            strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]);
+            remaining = remaining[j..];
+
+            const sequence: [4]u8 = switch (remaining.len) {
+                0 => unreachable,
+                1 => [_]u8{ remaining[0], 0, 0, 0 },
+                2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
+                3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
+                else => remaining[0..4].*,
+            };
+
+            const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+            if (comptime fail_if_invalid) {
+                if (replacement.fail) {
+                    if (comptime Environment.allow_assert) std.debug.assert(replacement.code_point == unicode_replacement);
+                    return error.InvalidByteSequence;
+                }
+            }
+            remaining = remaining[@max(replacement.len, 1)..];
+
+            //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+            switch (replacement.code_point) {
+                0...0xffff => |c| {
+                    try output.append(@as(u16, @intCast(c)));
+                },
+                else => |c| {
+                    try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
+                },
+            }
+        }
+
+        if (remaining.len > 0) {
+            try output.ensureTotalCapacityPrecise(output.items.len + remaining.len);
+
+            output.items.len += remaining.len;
+            strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining);
+        }
+
+        return output.items;
+    }
+
+    return null;
+}
+
 pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement {
     const c0 = @as(u21, input[0]);
 
diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js
index 8ec097f31..4991cf361 100644
--- a/test/js/web/encoding/text-decoder.test.js
+++ b/test/js/web/encoding/text-decoder.test.js
@@ -233,6 +233,20 @@ describe("TextDecoder", () => {
     }).toThrow(TypeError);
   });
 
+  it("should not trim invalid byte sequences when fatal is false", () => {
+    const buf = Buffer.from([77, 97, 110, 32, 208, 129, 240, 164, 173]);
+    const received = new TextDecoder("utf-8").decode(buf);
+    const expected = "Man Ё\ufffd";
+    expect(received).toBe(expected);
+  });
+
+  it("should trim when stream is true", () => {
+    const buf = Buffer.from([77, 97, 110, 32, 208, 129, 240, 164, 173]);
+    const received = new TextDecoder("utf-8").decode(buf, { stream: true });
+    const expected = "Man Ё";
+    expect(received).toBe(expected);
+  });
+
   it("constructor should set values", () => {
     const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false });
     expect(decoder.fatal).toBe(true);
author	Dylan Conway <35280289+dylan-conway@users.noreply.github.com>	2023-09-05 17:53:31 -0700
committer	GitHub <noreply@github.com>	2023-09-05 17:53:31 -0700
commit	70a5cfe9087c9836cef06ee3560f6111ed2fe18e (patch)
tree	a8968f64234384cda482ae7e85beb7a9c3f4e7e6
parent	1bd5b245b8a55353e60a2decad507ef8014be044 (diff)
download	bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.gz bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.zst bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.zip