aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-09-05 17:53:31 -0700
committerGravatar GitHub <noreply@github.com> 2023-09-05 17:53:31 -0700
commit 70a5cfe9087c9836cef06ee3560f6111ed2fe18e (patch)
tree a8968f64234384cda482ae7e85beb7a9c3f4e7e6
parent 1bd5b245b8a55353e60a2decad507ef8014be044 (diff)
downloadbun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.gz
bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.tar.zst
bun-70a5cfe9087c9836cef06ee3560f6111ed2fe18e.zip
fix text decode trim (#4495)
* remove trim
* separate function
* a test
* trim when `stream` is true

---------

Co-authored-by: Jarred Sumner <jarred@jarredsumner.com>
-rw-r--r-- src/bun.js/webcore/encoding.zig | 19
-rw-r--r-- src/string_immutable.zig | 115
-rw-r--r-- test/js/web/encoding/text-decoder.test.js | 14
3 files changed, 143 insertions, 5 deletions
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index 6f31fef82..53933fdb7 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -592,14 +592,22 @@ pub const TextDecoder = struct {
return JSValue.zero;
};
- return this.decodeSlice(globalThis, array_buffer.slice());
+ if (arguments.len > 1 and arguments[1].isObject()) {
+ if (arguments[1].get(globalThis, "stream")) |stream| {
+ if (stream.toBoolean()) {
+ return this.decodeSlice(globalThis, array_buffer.slice(), true);
+ }
+ }
+ }
+
+ return this.decodeSlice(globalThis, array_buffer.slice(), false);
}
pub fn decodeWithoutTypeChecks(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, uint8array: *JSC.JSUint8Array) callconv(.C) JSValue {
- return this.decodeSlice(globalThis, uint8array.slice());
+ return this.decodeSlice(globalThis, uint8array.slice(), false); // this entry point takes no options argument, so `stream` is always passed as false
}
- fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8) JSValue {
+ fn decodeSlice(this: *TextDecoder, globalThis: *JSC.JSGlobalObject, buffer_slice: []const u8, comptime stream: bool) JSValue {
switch (this.encoding) {
EncodingLabel.latin1 => {
if (strings.isAllASCII(buffer_slice)) {
@@ -620,8 +628,9 @@ pub const TextDecoder = struct {
return ZigString.toExternalU16(bytes.ptr, out.written, globalThis);
},
EncodingLabel.@"UTF-8" => {
+ const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim;
if (this.fatal) {
- if (strings.toUTF16Alloc(default_allocator, buffer_slice, true)) |result_| {
+ if (toUTF16(default_allocator, buffer_slice, true)) |result_| {
if (result_) |result| {
return ZigString.toExternalU16(result.ptr, result.len, globalThis);
}
@@ -640,7 +649,7 @@ pub const TextDecoder = struct {
}
}
} else {
- if (strings.toUTF16Alloc(default_allocator, buffer_slice, false)) |result_| {
+ if (toUTF16(default_allocator, buffer_slice, false)) |result_| {
if (result_) |result| {
return ZigString.toExternalU16(result.ptr, result.len, globalThis);
}
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index c62266c62..d2d71621f 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1415,6 +1415,121 @@ pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fa
return null;
}
+pub fn toUTF16AllocNoTrim(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 { // Converts UTF-8 `bytes` into a UTF-16 slice allocated from `allocator`. Returns null when `strings.firstNonASCII(bytes)` yields null; returns error.InvalidByteSequence on a bad sequence when `fail_if_invalid` is set, otherwise appends whatever code point the decoder reports for it (presumably U+FFFD — confirm).
+ if (strings.firstNonASCII(bytes)) |i| { // bytes[0..i] is treated as plain ASCII from here on; bytes[i] is the first byte needing real decoding
+ const output_: ?std.ArrayList(u16) = if (comptime bun.FeatureFlags.use_simdutf) simd: { // fast path, only compiled in when the simdutf feature flag is set
+ const out_length = bun.simdutf.length.utf16.from.utf8.le(bytes); // used as the element count of the output buffer allocated below
+
+ if (out_length == 0)
+ break :simd null; // nothing to pre-allocate: fall through to the ArrayList fallback below
+
+ var out = try allocator.alloc(u16, out_length);
+ log("toUTF16 {d} UTF8 -> {d} UTF16", .{ bytes.len, out_length });
+
+ // avoid `.with_errors.le()` due to https://github.com/simdutf/simdutf/issues/213
+ switch (bun.simdutf.convert.utf8.to.utf16.le(bytes, out)) {
+ 0 => { // a result of 0 is handled as "conversion failed"
+ if (comptime fail_if_invalid) {
+ allocator.free(out); // release the buffer before reporting the error; nothing else owns it yet
+ return error.InvalidByteSequence;
+ }
+
+ break :simd .{ // lenient mode: keep `out` as the list's backing buffer and redo the conversion manually below
+ .items = out[0..i], // NOTE(review): counts the first `i` units as already filled; this relies on simdutf having written the ASCII prefix before it gave up — confirm
+ .capacity = out.len,
+ .allocator = allocator,
+ };
+ },
+ else => return out, // non-zero result: the whole buffer is returned as-is (assumes the converter filled all `out_length` units — TODO confirm)
+ }
+ } else null;
+ var output = output_ orelse fallback: { // no reusable simdutf buffer: start a fresh list seeded with the ASCII prefix
+ var list = try std.ArrayList(u16).initCapacity(allocator, i + 2); // prefix plus two more units (presumably room for one surrogate pair)
+ list.items.len = i;
+ strings.copyU8IntoU16(list.items, bytes[0..i]); // widen the ASCII prefix from u8 to u16
+ break :fallback list;
+ };
+ errdefer output.deinit(); // frees the list's buffer if any later `try` or error return fires
+
+ var remaining = bytes[i..]; // begins at the first non-ASCII byte
+
+ { // decode that first non-ASCII sequence up front; the loop below repeats the same steps for each later one
+ const sequence: [4]u8 = switch (remaining.len) { // fixed 4-byte window over the input, zero-padded when fewer than 4 bytes are left
+ 0 => unreachable, // `i` indexes a byte inside `bytes`, so `remaining` cannot be empty here
+ 1 => [_]u8{ remaining[0], 0, 0, 0 },
+ 2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
+ 3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
+ else => remaining[0..4].*,
+ };
+
+ const replacement = strings.convertUTF8BytesIntoUTF16(&sequence); // yields `.code_point`, `.len` (input bytes consumed) and `.fail`
+ if (comptime fail_if_invalid) {
+ if (replacement.fail) {
+ if (comptime Environment.allow_assert) std.debug.assert(replacement.code_point == unicode_replacement);
+ return error.InvalidByteSequence; // strict mode: abort on the first bad sequence (errdefer above frees `output`)
+ }
+ }
+ remaining = remaining[@max(replacement.len, 1)..]; // always advance by at least one byte, even if the decoder reports length 0
+
+ //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+ switch (replacement.code_point) {
+ 0...0xffff => |c| { // fits in a single UTF-16 unit
+ try output.append(@as(u16, @intCast(c)));
+ },
+ else => |c| { // above 0xffff: written as a lead/trail pair
+ try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
+ },
+ }
+ }
+
+ while (strings.firstNonASCII(remaining)) |j| { // one iteration per remaining non-ASCII byte; `j` ASCII bytes precede it
+ const end = output.items.len;
+ try output.ensureUnusedCapacity(j);
+ output.items.len += j; // grow the length manually, then fill the new tail on the next line
+ strings.copyU8IntoU16(output.items[end..][0..j], remaining[0..j]); // widen the ASCII run that precedes the non-ASCII byte
+ remaining = remaining[j..];
+
+ const sequence: [4]u8 = switch (remaining.len) { // same zero-padded 4-byte window as in the block above
+ 0 => unreachable, // `j` indexes a byte inside `remaining`, so at least one byte is left
+ 1 => [_]u8{ remaining[0], 0, 0, 0 },
+ 2 => [_]u8{ remaining[0], remaining[1], 0, 0 },
+ 3 => [_]u8{ remaining[0], remaining[1], remaining[2], 0 },
+ else => remaining[0..4].*,
+ };
+
+ const replacement = strings.convertUTF8BytesIntoUTF16(&sequence);
+ if (comptime fail_if_invalid) {
+ if (replacement.fail) {
+ if (comptime Environment.allow_assert) std.debug.assert(replacement.code_point == unicode_replacement);
+ return error.InvalidByteSequence;
+ }
+ }
+ remaining = remaining[@max(replacement.len, 1)..]; // skip the consumed bytes, at least one
+
+ //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+ switch (replacement.code_point) {
+ 0...0xffff => |c| {
+ try output.append(@as(u16, @intCast(c)));
+ },
+ else => |c| {
+ try output.appendSlice(&[_]u16{ strings.u16Lead(c), strings.u16Trail(c) });
+ },
+ }
+ }
+
+ if (remaining.len > 0) { // the loop ended because no non-ASCII byte is left: widen the tail in one go
+ try output.ensureTotalCapacityPrecise(output.items.len + remaining.len);
+
+ output.items.len += remaining.len;
+ strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining);
+ }
+
+ return output.items; // NOTE(review): returns the list's `items` view, so the backing allocation may be larger than the returned length — confirm callers free it without relying on the exact size
+ }
+
+ return null; // `strings.firstNonASCII` found nothing to decode; no allocation was made
+}
+
pub fn utf16CodepointWithFFFD(comptime Type: type, input: Type) UTF16Replacement {
const c0 = @as(u21, input[0]);
diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js
index 8ec097f31..4991cf361 100644
--- a/test/js/web/encoding/text-decoder.test.js
+++ b/test/js/web/encoding/text-decoder.test.js
@@ -233,6 +233,20 @@ describe("TextDecoder", () => {
}).toThrow(TypeError);
});
+ it("should not trim invalid byte sequences when fatal is false", () => { // non-streaming decode must keep a truncated trailing sequence as a replacement character
+ const buf = Buffer.from([77, 97, 110, 32, 208, 129, 240, 164, 173]); // "Man " + U+0401 (0xD0 0x81) + the first three bytes of a four-byte sequence (0xF0 0xA4 0xAD)
+ const received = new TextDecoder("utf-8").decode(buf); // default options: fatal is false, no stream flag
+ const expected = "Man Ё\ufffd"; // the incomplete tail shows up as one U+FFFD
+ expect(received).toBe(expected);
+ });
+
+ it("should trim when stream is true", () => { // streaming decode must leave a truncated trailing sequence out of the output
+ const buf = Buffer.from([77, 97, 110, 32, 208, 129, 240, 164, 173]); // same fixture: "Man " + U+0401 + three bytes of an unfinished four-byte sequence
+ const received = new TextDecoder("utf-8").decode(buf, { stream: true });
+ const expected = "Man Ё"; // no U+FFFD: the unfinished bytes are not emitted
+ expect(received).toBe(expected);
+ });
+
it("constructor should set values", () => {
const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false });
expect(decoder.fatal).toBe(true);