From a098c6e5f6d63a9bfd1430798fce4896e26f1a9f Mon Sep 17 00:00:00 2001 From: WingLim Date: Sun, 17 Sep 2023 13:41:52 +0800 Subject: feat(encoding): TextDecoder support undefined (#5387) * feat(encoding): TextDecoder support undefined * chore: format test file --- test/js/web/encoding/text-decoder.test.js | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'test/js/web/encoding/text-decoder.test.js') diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index 4991cf361..dabdb0936 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -258,6 +258,11 @@ describe("TextDecoder", () => { const decoder = new TextDecoder("utf-8", { fatal: 10, ignoreBOM: {} }); }).toThrow(); }); + + it("should support undifined", () => { + const decoder = new TextDecoder(undefined); + expect(decoder.encoding).toBe("utf-8"); + }); }); it("truncated sequences", () => { -- cgit v1.2.3 From 5f66b4e729105286863a13955b1ed8897b45210e Mon Sep 17 00:00:00 2001 From: WingLim Date: Thu, 21 Sep 2023 09:44:05 +0800 Subject: feat(encoding): support BOM detection (#5550) * fix(encoding): export `getIgnoreBOM` * feat(encoding): support ignoreBOM * fix(encoding): not replace BOM to 0xFFFD * chore: use strict equal --- src/bun.js/bindings/ZigGeneratedClasses.cpp | 18 +++++++++++++++++- src/bun.js/bindings/generated_classes.zig | 4 ++++ src/bun.js/webcore/encoding.classes.ts | 3 +++ src/bun.js/webcore/encoding.zig | 29 +++++++++++++++++++++++------ test/js/web/encoding/text-decoder.test.js | 25 ++++++++++++++++++++++++- 5 files changed, 71 insertions(+), 8 deletions(-) (limited to 'test/js/web/encoding/text-decoder.test.js') diff --git a/src/bun.js/bindings/ZigGeneratedClasses.cpp b/src/bun.js/bindings/ZigGeneratedClasses.cpp index b84c1cd16..b3451a058 100644 --- a/src/bun.js/bindings/ZigGeneratedClasses.cpp +++ b/src/bun.js/bindings/ZigGeneratedClasses.cpp @@ -26577,12 +26577,16 @@ JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__encodingGetterWrap); extern "C" JSC::EncodedJSValue TextDecoderPrototype__getFatal(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap); +extern "C" JSC::EncodedJSValue TextDecoderPrototype__getIgnoreBOM(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); +JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap); + STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTextDecoderPrototype, JSTextDecoderPrototype::Base); static const HashTableValue JSTextDecoderPrototypeTableValues[] = { { "decode"_s, static_cast(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::DOMJITFunctionType, TextDecoderPrototype__decodeCallback, &DOMJITSignatureForTextDecoderPrototype__decode } }, { "encoding"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__encodingGetterWrap, 0 } }, - { "fatal"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } } + { "fatal"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } }, + { "ignoreBOM"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__ignoreBOMGetterWrap, 0 } } }; const ClassInfo JSTextDecoderPrototype::s_info = { "TextDecoder"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSTextDecoderPrototype) }; @@ -26670,6 +26674,18 @@ JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap, (JSGlobalObject RELEASE_AND_RETURN(throwScope, result); } +JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName)) +{ + auto& vm = lexicalGlobalObject->vm(); + Zig::GlobalObject* globalObject = reinterpret_cast(lexicalGlobalObject); + auto throwScope = DECLARE_THROW_SCOPE(vm); + JSTextDecoder* thisObject = jsCast(JSValue::decode(thisValue)); + JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject); + JSC::EncodedJSValue result = TextDecoderPrototype__getIgnoreBOM(thisObject->wrapped(), globalObject); + RETURN_IF_EXCEPTION(throwScope, {}); + RELEASE_AND_RETURN(throwScope, result); +} + void JSTextDecoderPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject) { Base::finishCreation(vm); diff --git a/src/bun.js/bindings/generated_classes.zig b/src/bun.js/bindings/generated_classes.zig index 581d4a5f3..50170f998 100644 --- a/src/bun.js/bindings/generated_classes.zig +++ b/src/bun.js/bindings/generated_classes.zig @@ -6862,6 +6862,9 @@ pub const JSTextDecoder = struct { if (@TypeOf(TextDecoder.getFatal) != GetterType) @compileLog("Expected TextDecoder.getFatal to be a getter"); + if (@TypeOf(TextDecoder.getIgnoreBOM) != GetterType) + @compileLog("Expected TextDecoder.getIgnoreBOM to be a getter"); + if (!JSC.is_bindgen) { @export(TextDecoder.constructor, .{ .name = "TextDecoderClass__construct" }); @export(TextDecoder.decode, .{ .name = "TextDecoderPrototype__decode" }); @@ -6869,6 +6872,7 @@ pub const JSTextDecoder = struct { @export(TextDecoder.finalize, .{ .name = "TextDecoderClass__finalize" }); @export(TextDecoder.getEncoding, .{ .name = "TextDecoderPrototype__getEncoding" }); @export(TextDecoder.getFatal, .{ .name = "TextDecoderPrototype__getFatal" }); + @export(TextDecoder.getIgnoreBOM, .{ .name = "TextDecoderPrototype__getIgnoreBOM" }); } } }; diff --git a/src/bun.js/webcore/encoding.classes.ts b/src/bun.js/webcore/encoding.classes.ts index 118dfd09e..7114f210e 100644 --- a/src/bun.js/webcore/encoding.classes.ts +++ b/src/bun.js/webcore/encoding.classes.ts @@ -16,6 +16,9 @@ export default [ fatal: { getter: "getFatal", }, + ignoreBOM: { + getter: "getIgnoreBOM", + }, decode: { fn: "decode", diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 8ffbd3fd0..ca0f44e6a 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -559,6 +559,13 @@ pub const TextDecoder = struct { remainder = remainder[1..]; continue; }, + // BOM handling + 0xFEFF => { + buffer.ensureTotalCapacity(allocator, 1) catch unreachable; + buffer.items.ptr[buffer.items.len] = remainder[0]; + buffer.items.len += 1; + remainder = remainder[1..]; + }, // Is this an unpaired low surrogate or four-digit hex escape? else => { @@ -629,8 +636,13 @@ pub const TextDecoder = struct { }, EncodingLabel.@"UTF-8" => { const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim; + const moved_buffer_slice_8 = if (!this.ignore_bom and buffer_slice.len > 3 and std.mem.eql(u8, &[_]u8{ '\xEF', '\xBB', '\xBF' }, buffer_slice[0..3])) + buffer_slice[3..] + else + buffer_slice; + if (this.fatal) { - if (toUTF16(default_allocator, buffer_slice, true)) |result_| { + if (toUTF16(default_allocator, moved_buffer_slice_8, true)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -649,7 +661,7 @@ pub const TextDecoder = struct { } } } else { - if (toUTF16(default_allocator, buffer_slice, false)) |result_| { + if (toUTF16(default_allocator, moved_buffer_slice_8, false)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -664,15 +676,20 @@ pub const TextDecoder = struct { } // Experiment: using mimalloc directly is slightly slower - return ZigString.init(buffer_slice).toValueGC(globalThis); + return ZigString.init(moved_buffer_slice_8).toValueGC(globalThis); }, EncodingLabel.@"UTF-16LE" => { - if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) { - return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, buffer_slice))), globalThis); + const moved_buffer_slice_16 = if (!this.ignore_bom and buffer_slice.len > 2 and std.mem.eql(u8, &[_]u8{ '\xFF', '\xFE' }, buffer_slice[0..2])) + buffer_slice[2..] + else + buffer_slice; + + if (std.mem.isAligned(@intFromPtr(moved_buffer_slice_16.ptr), @alignOf([*]const u16))) { + return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, moved_buffer_slice_16))), globalThis); } - return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, buffer_slice), globalThis); + return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, moved_buffer_slice_16), globalThis); }, else => { globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}); diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index dabdb0936..de71fb351 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -250,7 +250,7 @@ describe("TextDecoder", () => { it("constructor should set values", () => { const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false }); expect(decoder.fatal).toBe(true); - // expect(decoder.ignoreBOM).toBe(false); // currently the getter for ignoreBOM doesn't work and always returns undefined + expect(decoder.ignoreBOM).toBe(false); }); it("should throw on invalid input", () => { @@ -265,6 +265,29 @@ describe("TextDecoder", () => { }); }); +describe("TextDecoder ignoreBOM", () => { + + it.each([ + { + encoding: 'utf-8', + bytes: [0xEF, 0xBB, 0xBF, 0x61, 0x62, 0x63] + }, + { + encoding: 'utf-16le', + bytes: [0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00] + } + ])('should ignoreBOM for: %o', ({encoding, bytes}) => { + const BOM = '\uFEFF'; + const array = new Uint8Array(bytes); + + const decoder_ignore_bom = new TextDecoder(encoding, {ignoreBOM: true}); + expect(decoder_ignore_bom.decode(array)).toStrictEqual(`${BOM}abc`); + + const decoder_not_ignore_bom = new TextDecoder(encoding, {ignoreBOM: false}); + expect(decoder_not_ignore_bom.decode(array)).toStrictEqual('abc'); + }); +}); + it("truncated sequences", () => { const assert_equals = (a, b) => expect(a).toBe(b); -- cgit v1.2.3 From 01d2cb5d98c6e90507d1dfba962276cf2edf9ad7 Mon Sep 17 00:00:00 2001 From: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com> Date: Thu, 21 Sep 2023 00:51:48 -0700 Subject: Prettier --- test/js/web/encoding/text-decoder.test.js | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'test/js/web/encoding/text-decoder.test.js') diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index de71fb351..3685a5f6d 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -266,25 +266,24 @@ describe("TextDecoder", () => { }); describe("TextDecoder ignoreBOM", () => { - it.each([ { - encoding: 'utf-8', - bytes: [0xEF, 0xBB, 0xBF, 0x61, 0x62, 0x63] + encoding: "utf-8", + bytes: [0xef, 0xbb, 0xbf, 0x61, 0x62, 0x63], }, { - encoding: 'utf-16le', - bytes: [0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00] - } - ])('should ignoreBOM for: %o', ({encoding, bytes}) => { - const BOM = '\uFEFF'; + encoding: "utf-16le", + bytes: [0xff, 0xfe, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00], + }, + ])("should ignoreBOM for: %o", ({ encoding, bytes }) => { + const BOM = "\uFEFF"; const array = new Uint8Array(bytes); - const decoder_ignore_bom = new TextDecoder(encoding, {ignoreBOM: true}); + const decoder_ignore_bom = new TextDecoder(encoding, { ignoreBOM: true }); expect(decoder_ignore_bom.decode(array)).toStrictEqual(`${BOM}abc`); - const decoder_not_ignore_bom = new TextDecoder(encoding, {ignoreBOM: false}); - expect(decoder_not_ignore_bom.decode(array)).toStrictEqual('abc'); + const decoder_not_ignore_bom = new TextDecoder(encoding, { ignoreBOM: false }); + expect(decoder_not_ignore_bom.decode(array)).toStrictEqual("abc"); }); }); -- cgit v1.2.3 From abfc10afeb73f9447e47929359d37f2b488c3c81 Mon Sep 17 00:00:00 2001 From: Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com> Date: Thu, 21 Sep 2023 07:10:07 -0700 Subject: Revert "feat(encoding): support BOM detection (#5550)" This reverts commit 5f66b4e729105286863a13955b1ed8897b45210e. This caused test failures in text-encoder. cc @WingLim --- src/bun.js/bindings/ZigGeneratedClasses.cpp | 18 +----------------- src/bun.js/bindings/generated_classes.zig | 4 ---- src/bun.js/webcore/encoding.classes.ts | 3 --- src/bun.js/webcore/encoding.zig | 29 ++++++----------------------- test/js/web/encoding/text-decoder.test.js | 24 +----------------------- 5 files changed, 8 insertions(+), 70 deletions(-) (limited to 'test/js/web/encoding/text-decoder.test.js') diff --git a/src/bun.js/bindings/ZigGeneratedClasses.cpp b/src/bun.js/bindings/ZigGeneratedClasses.cpp index b3451a058..b84c1cd16 100644 --- a/src/bun.js/bindings/ZigGeneratedClasses.cpp +++ b/src/bun.js/bindings/ZigGeneratedClasses.cpp @@ -26577,16 +26577,12 @@ JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__encodingGetterWrap); extern "C" JSC::EncodedJSValue TextDecoderPrototype__getFatal(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap); -extern "C" JSC::EncodedJSValue TextDecoderPrototype__getIgnoreBOM(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); -JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap); - STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTextDecoderPrototype, JSTextDecoderPrototype::Base); static const HashTableValue JSTextDecoderPrototypeTableValues[] = { { "decode"_s, static_cast(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::DOMJITFunctionType, TextDecoderPrototype__decodeCallback, &DOMJITSignatureForTextDecoderPrototype__decode } }, { "encoding"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__encodingGetterWrap, 0 } }, - { "fatal"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } }, - { "ignoreBOM"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__ignoreBOMGetterWrap, 0 } } + { "fatal"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } } }; const ClassInfo JSTextDecoderPrototype::s_info = { "TextDecoder"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSTextDecoderPrototype) }; @@ -26674,18 +26670,6 @@ JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap, (JSGlobalObject RELEASE_AND_RETURN(throwScope, result); } -JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName)) -{ - auto& vm = lexicalGlobalObject->vm(); - Zig::GlobalObject* globalObject = reinterpret_cast(lexicalGlobalObject); - auto throwScope = DECLARE_THROW_SCOPE(vm); - JSTextDecoder* thisObject = jsCast(JSValue::decode(thisValue)); - JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject); - JSC::EncodedJSValue result = TextDecoderPrototype__getIgnoreBOM(thisObject->wrapped(), globalObject); - RETURN_IF_EXCEPTION(throwScope, {}); - RELEASE_AND_RETURN(throwScope, result); -} - void JSTextDecoderPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject) { Base::finishCreation(vm); diff --git a/src/bun.js/bindings/generated_classes.zig b/src/bun.js/bindings/generated_classes.zig index 50170f998..581d4a5f3 100644 --- a/src/bun.js/bindings/generated_classes.zig +++ b/src/bun.js/bindings/generated_classes.zig @@ -6862,9 +6862,6 @@ pub const JSTextDecoder = struct { if (@TypeOf(TextDecoder.getFatal) != GetterType) @compileLog("Expected TextDecoder.getFatal to be a getter"); - if (@TypeOf(TextDecoder.getIgnoreBOM) != GetterType) - @compileLog("Expected TextDecoder.getIgnoreBOM to be a getter"); - if (!JSC.is_bindgen) { @export(TextDecoder.constructor, .{ .name = "TextDecoderClass__construct" }); @export(TextDecoder.decode, .{ .name = "TextDecoderPrototype__decode" }); @@ -6872,7 +6869,6 @@ pub const JSTextDecoder = struct { @export(TextDecoder.finalize, .{ .name = "TextDecoderClass__finalize" }); @export(TextDecoder.getEncoding, .{ .name = "TextDecoderPrototype__getEncoding" }); @export(TextDecoder.getFatal, .{ .name = "TextDecoderPrototype__getFatal" }); - @export(TextDecoder.getIgnoreBOM, .{ .name = "TextDecoderPrototype__getIgnoreBOM" }); } } }; diff --git a/src/bun.js/webcore/encoding.classes.ts b/src/bun.js/webcore/encoding.classes.ts index 7114f210e..118dfd09e 100644 --- a/src/bun.js/webcore/encoding.classes.ts +++ b/src/bun.js/webcore/encoding.classes.ts @@ -16,9 +16,6 @@ export default [ fatal: { getter: "getFatal", }, - ignoreBOM: { - getter: "getIgnoreBOM", - }, decode: { fn: "decode", diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index ca0f44e6a..8ffbd3fd0 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -559,13 +559,6 @@ pub const TextDecoder = struct { remainder = remainder[1..]; continue; }, - // BOM handling - 0xFEFF => { - buffer.ensureTotalCapacity(allocator, 1) catch unreachable; - buffer.items.ptr[buffer.items.len] = remainder[0]; - buffer.items.len += 1; - remainder = remainder[1..]; - }, // Is this an unpaired low surrogate or four-digit hex escape? else => { @@ -636,13 +629,8 @@ pub const TextDecoder = struct { }, EncodingLabel.@"UTF-8" => { const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim; - const moved_buffer_slice_8 = if (!this.ignore_bom and buffer_slice.len > 3 and std.mem.eql(u8, &[_]u8{ '\xEF', '\xBB', '\xBF' }, buffer_slice[0..3])) - buffer_slice[3..] - else - buffer_slice; - if (this.fatal) { - if (toUTF16(default_allocator, moved_buffer_slice_8, true)) |result_| { + if (toUTF16(default_allocator, buffer_slice, true)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -661,7 +649,7 @@ pub const TextDecoder = struct { } } } else { - if (toUTF16(default_allocator, moved_buffer_slice_8, false)) |result_| { + if (toUTF16(default_allocator, buffer_slice, false)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -676,20 +664,15 @@ pub const TextDecoder = struct { } // Experiment: using mimalloc directly is slightly slower - return ZigString.init(moved_buffer_slice_8).toValueGC(globalThis); + return ZigString.init(buffer_slice).toValueGC(globalThis); }, EncodingLabel.@"UTF-16LE" => { - const moved_buffer_slice_16 = if (!this.ignore_bom and buffer_slice.len > 2 and std.mem.eql(u8, &[_]u8{ '\xFF', '\xFE' }, buffer_slice[0..2])) - buffer_slice[2..] - else - buffer_slice; - - if (std.mem.isAligned(@intFromPtr(moved_buffer_slice_16.ptr), @alignOf([*]const u16))) { - return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, moved_buffer_slice_16))), globalThis); + if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) { + return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, buffer_slice))), globalThis); } - return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, moved_buffer_slice_16), globalThis); + return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, buffer_slice), globalThis); }, else => { globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}); diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index 3685a5f6d..dabdb0936 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -250,7 +250,7 @@ describe("TextDecoder", () => { it("constructor should set values", () => { const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false }); expect(decoder.fatal).toBe(true); - expect(decoder.ignoreBOM).toBe(false); + // expect(decoder.ignoreBOM).toBe(false); // currently the getter for ignoreBOM doesn't work and always returns undefined }); it("should throw on invalid input", () => { @@ -265,28 +265,6 @@ describe("TextDecoder", () => { }); }); -describe("TextDecoder ignoreBOM", () => { - it.each([ - { - encoding: "utf-8", - bytes: [0xef, 0xbb, 0xbf, 0x61, 0x62, 0x63], - }, - { - encoding: "utf-16le", - bytes: [0xff, 0xfe, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00], - }, - ])("should ignoreBOM for: %o", ({ encoding, bytes }) => { - const BOM = "\uFEFF"; - const array = new Uint8Array(bytes); - - const decoder_ignore_bom = new TextDecoder(encoding, { ignoreBOM: true }); - expect(decoder_ignore_bom.decode(array)).toStrictEqual(`${BOM}abc`); - - const decoder_not_ignore_bom = new TextDecoder(encoding, { ignoreBOM: false }); - expect(decoder_not_ignore_bom.decode(array)).toStrictEqual("abc"); - }); -}); - it("truncated sequences", () => { const assert_equals = (a, b) => expect(a).toBe(b); -- cgit v1.2.3 From 476fa4deda73ce3d63a2fb8175e66678434668d6 Mon Sep 17 00:00:00 2001 From: WingLim Date: Wed, 4 Oct 2023 01:28:59 +0800 Subject: feat(encoding): support BOM detection with test passed (#6074) --- src/bun.js/bindings/ZigGeneratedClasses.cpp | 18 +++++++++++++++++- src/bun.js/bindings/generated_classes.zig | 4 ++++ src/bun.js/webcore/encoding.classes.ts | 3 +++ src/bun.js/webcore/encoding.zig | 29 +++++++++++++++++++++++------ test/js/web/encoding/text-decoder.test.js | 24 +++++++++++++++++++++++- test/js/web/encoding/text-encoder.test.js | 2 +- 6 files changed, 71 insertions(+), 9 deletions(-) (limited to 'test/js/web/encoding/text-decoder.test.js') diff --git a/src/bun.js/bindings/ZigGeneratedClasses.cpp b/src/bun.js/bindings/ZigGeneratedClasses.cpp index 45d60d379..eff4b9c7d 100644 --- a/src/bun.js/bindings/ZigGeneratedClasses.cpp +++ b/src/bun.js/bindings/ZigGeneratedClasses.cpp @@ -26705,12 +26705,16 @@ JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__encodingGetterWrap); extern "C" JSC::EncodedJSValue TextDecoderPrototype__getFatal(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap); +extern "C" JSC::EncodedJSValue TextDecoderPrototype__getIgnoreBOM(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); +JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap); + STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTextDecoderPrototype, JSTextDecoderPrototype::Base); static const HashTableValue JSTextDecoderPrototypeTableValues[] = { { "decode"_s, static_cast(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::DOMJITFunctionType, TextDecoderPrototype__decodeCallback, &DOMJITSignatureForTextDecoderPrototype__decode } }, { "encoding"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__encodingGetterWrap, 0 } }, - { "fatal"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } } + { "fatal"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } }, + { "ignoreBOM"_s, static_cast(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__ignoreBOMGetterWrap, 0 } } }; const ClassInfo JSTextDecoderPrototype::s_info = { "TextDecoder"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSTextDecoderPrototype) }; @@ -26798,6 +26802,18 @@ JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap, (JSGlobalObject RELEASE_AND_RETURN(throwScope, result); } +JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName)) +{ + auto& vm = lexicalGlobalObject->vm(); + Zig::GlobalObject* globalObject = reinterpret_cast(lexicalGlobalObject); + auto throwScope = DECLARE_THROW_SCOPE(vm); + JSTextDecoder* thisObject = jsCast(JSValue::decode(thisValue)); + JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject); + JSC::EncodedJSValue result = TextDecoderPrototype__getIgnoreBOM(thisObject->wrapped(), globalObject); + RETURN_IF_EXCEPTION(throwScope, {}); + RELEASE_AND_RETURN(throwScope, result); +} + void JSTextDecoderPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject) { Base::finishCreation(vm); diff --git a/src/bun.js/bindings/generated_classes.zig b/src/bun.js/bindings/generated_classes.zig index da93b2706..fc1aa2be6 100644 --- a/src/bun.js/bindings/generated_classes.zig +++ b/src/bun.js/bindings/generated_classes.zig @@ -6874,6 +6874,9 @@ pub const JSTextDecoder = struct { if (@TypeOf(TextDecoder.getFatal) != GetterType) @compileLog("Expected TextDecoder.getFatal to be a getter"); + if (@TypeOf(TextDecoder.getIgnoreBOM) != GetterType) + @compileLog("Expected TextDecoder.getIgnoreBOM to be a getter"); + if (!JSC.is_bindgen) { @export(TextDecoder.constructor, .{ .name = "TextDecoderClass__construct" }); @export(TextDecoder.decode, .{ .name = "TextDecoderPrototype__decode" }); @@ -6881,6 +6884,7 @@ pub const JSTextDecoder = struct { @export(TextDecoder.finalize, .{ .name = "TextDecoderClass__finalize" }); @export(TextDecoder.getEncoding, .{ .name = "TextDecoderPrototype__getEncoding" }); @export(TextDecoder.getFatal, .{ .name = "TextDecoderPrototype__getFatal" }); + @export(TextDecoder.getIgnoreBOM, .{ .name = "TextDecoderPrototype__getIgnoreBOM" }); } } }; diff --git a/src/bun.js/webcore/encoding.classes.ts b/src/bun.js/webcore/encoding.classes.ts index 118dfd09e..7114f210e 100644 --- a/src/bun.js/webcore/encoding.classes.ts +++ b/src/bun.js/webcore/encoding.classes.ts @@ -16,6 +16,9 @@ export default [ fatal: { getter: "getFatal", }, + ignoreBOM: { + getter: "getIgnoreBOM", + }, decode: { fn: "decode", diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 8ffbd3fd0..ca0f44e6a 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -559,6 +559,13 @@ pub const TextDecoder = struct { remainder = remainder[1..]; continue; }, + // BOM handling + 0xFEFF => { + buffer.ensureTotalCapacity(allocator, 1) catch unreachable; + buffer.items.ptr[buffer.items.len] = remainder[0]; + buffer.items.len += 1; + remainder = remainder[1..]; + }, // Is this an unpaired low surrogate or four-digit hex escape? else => { @@ -629,8 +636,13 @@ pub const TextDecoder = struct { }, EncodingLabel.@"UTF-8" => { const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim; + const moved_buffer_slice_8 = if (!this.ignore_bom and buffer_slice.len > 3 and std.mem.eql(u8, &[_]u8{ '\xEF', '\xBB', '\xBF' }, buffer_slice[0..3])) + buffer_slice[3..] + else + buffer_slice; + if (this.fatal) { - if (toUTF16(default_allocator, buffer_slice, true)) |result_| { + if (toUTF16(default_allocator, moved_buffer_slice_8, true)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -649,7 +661,7 @@ pub const TextDecoder = struct { } } } else { - if (toUTF16(default_allocator, buffer_slice, false)) |result_| { + if (toUTF16(default_allocator, moved_buffer_slice_8, false)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -664,15 +676,20 @@ pub const TextDecoder = struct { } // Experiment: using mimalloc directly is slightly slower - return ZigString.init(buffer_slice).toValueGC(globalThis); + return ZigString.init(moved_buffer_slice_8).toValueGC(globalThis); }, EncodingLabel.@"UTF-16LE" => { - if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) { - return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, buffer_slice))), globalThis); + const moved_buffer_slice_16 = if (!this.ignore_bom and buffer_slice.len > 2 and std.mem.eql(u8, &[_]u8{ '\xFF', '\xFE' }, buffer_slice[0..2])) + buffer_slice[2..] + else + buffer_slice; + + if (std.mem.isAligned(@intFromPtr(moved_buffer_slice_16.ptr), @alignOf([*]const u16))) { + return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, moved_buffer_slice_16))), globalThis); } - return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, buffer_slice), globalThis); + return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, moved_buffer_slice_16), globalThis); }, else => { globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}); diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index dabdb0936..3685a5f6d 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -250,7 +250,7 @@ describe("TextDecoder", () => { it("constructor should set values", () => { const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false }); expect(decoder.fatal).toBe(true); - // expect(decoder.ignoreBOM).toBe(false); // currently the getter for ignoreBOM doesn't work and always returns undefined + expect(decoder.ignoreBOM).toBe(false); }); it("should throw on invalid input", () => { @@ -265,6 +265,28 @@ describe("TextDecoder", () => { }); }); +describe("TextDecoder ignoreBOM", () => { + it.each([ + { + encoding: "utf-8", + bytes: [0xef, 0xbb, 0xbf, 0x61, 0x62, 0x63], + }, + { + encoding: "utf-16le", + bytes: [0xff, 0xfe, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00], + }, + ])("should ignoreBOM for: %o", ({ encoding, bytes }) => { + const BOM = "\uFEFF"; + const array = new Uint8Array(bytes); + + const decoder_ignore_bom = new TextDecoder(encoding, { ignoreBOM: true }); + expect(decoder_ignore_bom.decode(array)).toStrictEqual(`${BOM}abc`); + + const decoder_not_ignore_bom = new TextDecoder(encoding, { ignoreBOM: false }); + expect(decoder_not_ignore_bom.decode(array)).toStrictEqual("abc"); + }); +}); + it("truncated sequences", () => { const assert_equals = (a, b) => expect(a).toBe(b); diff --git a/test/js/web/encoding/text-encoder.test.js b/test/js/web/encoding/text-encoder.test.js index 1bf2057bc..78940a6eb 100644 --- a/test/js/web/encoding/text-encoder.test.js +++ b/test/js/web/encoding/text-encoder.test.js @@ -111,7 +111,7 @@ describe("TextEncoder", () => { const fixture = new Uint8Array(await Bun.file(import.meta.dir + "/utf8-encoding-fixture.bin").arrayBuffer()); const length = 0x110000; let textEncoder = new TextEncoder(); - let textDecoder = new TextDecoder(); + let textDecoder = new TextDecoder("utf-8", { ignoreBOM: true }); let encodeOut = new Uint8Array(length * 4); let encodeIntoOut = new Uint8Array(length * 4); let encodeIntoBuffer = new Uint8Array(4); -- cgit v1.2.3