diff options
author | 2023-09-05 17:53:31 -0700 | |
---|---|---|
committer | 2023-09-05 17:53:31 -0700 | |
commit | 70a5cfe9087c9836cef06ee3560f6111ed2fe18e (patch) | |
tree | a8968f64234384cda482ae7e85beb7a9c3f4e7e6 | |
parent | 1bd5b245b8a55353e60a2decad507ef8014be044 (diff) | |
download | bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.gz bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.zst bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.zip |
fix text decode trim (#4495)
* remove trim
* separate function
* a test
* trim when `stream` is true
---------
Co-authored-by: Jarred Sumner <jarred@jarredsumner.com>
-rw-r--r-- | src/bun.js/webcore/encoding.zig | 19 |
-rw-r--r-- | src/string_immutable.zig | 115 |
-rw-r--r-- | test/js/web/encoding/text-decoder.test.js | 14 |
3 files changed, 143 insertions, 5 deletions
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 6f31fef82..53933fdb7 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -592,14 +592,22 @@ pub const TextDecoder = struct { return JSValue.zero; }; - return this.decodeSlice(globalThis, array_buffer.slice()); + if (arguments.len > 1 and arguments[1].isObject()) { + if (arguments[1].get(globalThis, "stream")) |stream| { + if (stream.toBoolean()) { + return this.decodeSlice(globalThis, array_buffer.slice(), true); + } + } + } + + return this.decodeSlice(globalThis, array_buffer.slice(), false); } pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) callconv(.C) JSValue { - return this.decodeSlice(globalThis, uint8array.slice()); + return this.decodeSlice(globalThis, uint8array.slice(), false); } - fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8) JSValue { + fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime stream: bool) JSValue { switch (this.encoding) { EncodingLabel.latin1 => { if (strings.isAllASCII(buffer_slice)) { @@ -620,8 +628,9 @@ pub const TextDecoder = struct { return ZigString.toExternalU16(bytes.ptr, out.written, globalThis); }, EncodingLabel.@"UTF-8" => { + const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim; if (this.fatal) { - if (strings.toUTF16Alloc(default_allocator, buffer_slice, true)) |result_| { + if (toUTF16(default_allocator, buffer_slice, true)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -640,7 +649,7 @@ pub const TextDecoder = struct { } } } else { - if (strings.toUTF16Alloc(default_allocator, buffer_slice, false)) |result_| { + if (toUTF16(default_allocator, buffer_slice, false)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, 
globalThis); } diff --git a/src/string_immutable.zig b/src/string_immutable.zig index c62266c62..d2d71621f 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1415,6 +1415,121 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa return null; } +pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 { + if (strings.firstNonASCII(bytes)) |i| { + const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: { + const out_length = bun.simdutf.length.utf16.from.utf8.le(bytes); + + if (out_length == 0) + break :simd null; + + var out = try allocator.alloc(u16, out_length); + log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length }); + + // avoid `.with_errors.le()` due to https://github.com/simdutf/simdutf/issues/213 + switch (bun.simdutf.convert.utf8.to.utf16.le(bytes, out)) { + 0 => { + if (comptime fail_if_invalid) { + allocator.free(out); + return error.InvalidByteSequence; + } + + break :simd .{ + .items = out[0..i], + .capacity = out.len, + .allocator = allocator, + }; + }, + else => return out, + } + } else null; + var output = output_ orelse fallback: { + var list = try std.ArrayList(u16).initCapacity(allocator, i + 2); + list.items.len = i; + strings.copyU8IntoU16(list.items, bytes[0..i]); + break :fallback list; + }; + errdefer output.deinit(); + + var remaining = bytes[i..]; + + { + const sequence: [4]u8 = switch (remaining.len) { + 0 => unreachable, + 1 => [_]u8{ remaining[0], 0, 0, 0 }, + 2 => [_]u8{ remaining[0], remaining[1], 0, 0 }, + 3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 }, + else => remaining[0..4].*, + }; + + const replacement = strings.convertUTF8BytesIntoUTF16(&sequence); + if (comptime fail_if_invalid) { + if (replacement.fail) { + if (comptime Environment.allow_assert) std.debug.assert(replacement.code_point == unicode_replacement); + return error.InvalidByteSequence; + } + } + remaining = 
remaining[@max(replacement.len, 1)..]; + + //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + switch (replacement.code_point) { + 0...0xffff => |c| { + try output.append(@as(u16, @intCast(c))); + }, + else => |c| { + try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) }); + }, + } + } + + while (strings.firstNonASCII(remaining)) |j| { + const end = output.items.len; + try output.ensureUnusedCapacity(j); + output.items.len += j; + strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]); + remaining = remaining[j..]; + + const sequence: [4]u8 = switch (remaining.len) { + 0 => unreachable, + 1 => [_]u8{ remaining[0], 0, 0, 0 }, + 2 => [_]u8{ remaining[0], remaining[1], 0, 0 }, + 3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 }, + else => remaining[0..4].*, + }; + + const replacement = strings.convertUTF8BytesIntoUTF16(&sequence); + if (comptime fail_if_invalid) { + if (replacement.fail) { + if (comptime Environment.allow_assert) std.debug.assert(replacement.code_point == unicode_replacement); + return error.InvalidByteSequence; + } + } + remaining = remaining[@max(replacement.len, 1)..]; + + //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 
1 : 2) + switch (replacement.code_point) { + 0...0xffff => |c| { + try output.append(@as(u16, @intCast(c))); + }, + else => |c| { + try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) }); + }, + } + } + + if (remaining.len > 0) { + try output.ensureTotalCapacityPrecise(output.items.len + remaining.len); + + output.items.len += remaining.len; + strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining); + } + + return output.items; + } + + return null; +} + pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement { const c0 = @as(u21, input[0]); diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index 8ec097f31..4991cf361 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -233,6 +233,20 @@ describe("TextDecoder", () => { }).toThrow(TypeError); }); + it("should not trim invalid byte sequences when fatal is false", () => { + const buf = Buffer.from([77, 97, 110, 32, 208, 129, 240, 164, 173]); + const received = new TextDecoder("utf-8").decode(buf); + const expected = "Man Ё\ufffd"; + expect(received).toBe(expected); + }); + + it("should trim when stream is true", () => { + const buf = Buffer.from([77, 97, 110, 32, 208, 129, 240, 164, 173]); + const received = new TextDecoder("utf-8").decode(buf, { stream: true }); + const expected = "Man Ё"; + expect(received).toBe(expected); + }); + it("constructor should set values", () => { const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false }); expect(decoder.fatal).toBe(true); |