diff options
author | 2023-09-21 09:44:05 +0800 | |
---|---|---|
committer | 2023-09-20 18:44:05 -0700 | |
commit | 5f66b4e729105286863a13955b1ed8897b45210e (patch) | |
tree | cec6d517bac47ccd52f7e8aeef5f99fb98412008 | |
parent | 7319142fd866d8314364d769f401a492892f7d63 (diff) | |
download | bun-5f66b4e729105286863a13955b1ed8897b45210e.tar.gz bun-5f66b4e729105286863a13955b1ed8897b45210e.tar.zst bun-5f66b4e729105286863a13955b1ed8897b45210e.zip |
feat(encoding): support BOM detection (#5550)
* fix(encoding): export `getIgnoreBOM`
* feat(encoding): support ignoreBOM
* fix(encoding): do not replace BOM with 0xFFFD
* chore: use strict equal
-rw-r--r-- | src/bun.js/bindings/ZigGeneratedClasses.cpp | 18 | ||||
-rw-r--r-- | src/bun.js/bindings/generated_classes.zig | 4 | ||||
-rw-r--r-- | src/bun.js/webcore/encoding.classes.ts | 3 | ||||
-rw-r--r-- | src/bun.js/webcore/encoding.zig | 29 | ||||
-rw-r--r-- | test/js/web/encoding/text-decoder.test.js | 25 |
5 files changed, 71 insertions, 8 deletions
diff --git a/src/bun.js/bindings/ZigGeneratedClasses.cpp b/src/bun.js/bindings/ZigGeneratedClasses.cpp index b84c1cd16..b3451a058 100644 --- a/src/bun.js/bindings/ZigGeneratedClasses.cpp +++ b/src/bun.js/bindings/ZigGeneratedClasses.cpp @@ -26577,12 +26577,16 @@ JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__encodingGetterWrap); extern "C" JSC::EncodedJSValue TextDecoderPrototype__getFatal(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap); +extern "C" JSC::EncodedJSValue TextDecoderPrototype__getIgnoreBOM(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject); +JSC_DECLARE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap); + STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTextDecoderPrototype, JSTextDecoderPrototype::Base); static const HashTableValue JSTextDecoderPrototypeTableValues[] = { { "decode"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::DOMJITFunctionType, TextDecoderPrototype__decodeCallback, &DOMJITSignatureForTextDecoderPrototype__decode } }, { "encoding"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__encodingGetterWrap, 0 } }, - { "fatal"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } } + { "fatal"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__fatalGetterWrap, 0 } }, + { 
"ignoreBOM"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TextDecoderPrototype__ignoreBOMGetterWrap, 0 } } }; const ClassInfo JSTextDecoderPrototype::s_info = { "TextDecoder"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSTextDecoderPrototype) }; @@ -26670,6 +26674,18 @@ JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__fatalGetterWrap, (JSGlobalObject RELEASE_AND_RETURN(throwScope, result); } +JSC_DEFINE_CUSTOM_GETTER(TextDecoderPrototype__ignoreBOMGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName)) +{ + auto& vm = lexicalGlobalObject->vm(); + Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject); + auto throwScope = DECLARE_THROW_SCOPE(vm); + JSTextDecoder* thisObject = jsCast<JSTextDecoder*>(JSValue::decode(thisValue)); + JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject); + JSC::EncodedJSValue result = TextDecoderPrototype__getIgnoreBOM(thisObject->wrapped(), globalObject); + RETURN_IF_EXCEPTION(throwScope, {}); + RELEASE_AND_RETURN(throwScope, result); +} + void JSTextDecoderPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject) { Base::finishCreation(vm); diff --git a/src/bun.js/bindings/generated_classes.zig b/src/bun.js/bindings/generated_classes.zig index 581d4a5f3..50170f998 100644 --- a/src/bun.js/bindings/generated_classes.zig +++ b/src/bun.js/bindings/generated_classes.zig @@ -6862,6 +6862,9 @@ pub const JSTextDecoder = struct { if (@TypeOf(TextDecoder.getFatal) != GetterType) @compileLog("Expected TextDecoder.getFatal to be a getter"); + if (@TypeOf(TextDecoder.getIgnoreBOM) != GetterType) + @compileLog("Expected TextDecoder.getIgnoreBOM to be a getter"); + if (!JSC.is_bindgen) { @export(TextDecoder.constructor, .{ .name = 
"TextDecoderClass__construct" }); @export(TextDecoder.decode, .{ .name = "TextDecoderPrototype__decode" }); @@ -6869,6 +6872,7 @@ pub const JSTextDecoder = struct { @export(TextDecoder.finalize, .{ .name = "TextDecoderClass__finalize" }); @export(TextDecoder.getEncoding, .{ .name = "TextDecoderPrototype__getEncoding" }); @export(TextDecoder.getFatal, .{ .name = "TextDecoderPrototype__getFatal" }); + @export(TextDecoder.getIgnoreBOM, .{ .name = "TextDecoderPrototype__getIgnoreBOM" }); } } }; diff --git a/src/bun.js/webcore/encoding.classes.ts b/src/bun.js/webcore/encoding.classes.ts index 118dfd09e..7114f210e 100644 --- a/src/bun.js/webcore/encoding.classes.ts +++ b/src/bun.js/webcore/encoding.classes.ts @@ -16,6 +16,9 @@ export default [ fatal: { getter: "getFatal", }, + ignoreBOM: { + getter: "getIgnoreBOM", + }, decode: { fn: "decode", diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig index 8ffbd3fd0..ca0f44e6a 100644 --- a/src/bun.js/webcore/encoding.zig +++ b/src/bun.js/webcore/encoding.zig @@ -559,6 +559,13 @@ pub const TextDecoder = struct { remainder = remainder[1..]; continue; }, + // BOM handling + 0xFEFF => { + buffer.ensureTotalCapacity(allocator, 1) catch unreachable; + buffer.items.ptr[buffer.items.len] = remainder[0]; + buffer.items.len += 1; + remainder = remainder[1..]; + }, // Is this an unpaired low surrogate or four-digit hex escape? else => { @@ -629,8 +636,13 @@ pub const TextDecoder = struct { }, EncodingLabel.@"UTF-8" => { const toUTF16 = if (stream) strings.toUTF16Alloc else strings.toUTF16AllocNoTrim; + const moved_buffer_slice_8 = if (!this.ignore_bom and buffer_slice.len > 3 and std.mem.eql(u8, &[_]u8{ '\xEF', '\xBB', '\xBF' }, buffer_slice[0..3])) + buffer_slice[3..] 
+ else + buffer_slice; + if (this.fatal) { - if (toUTF16(default_allocator, buffer_slice, true)) |result_| { + if (toUTF16(default_allocator, moved_buffer_slice_8, true)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -649,7 +661,7 @@ pub const TextDecoder = struct { } } } else { - if (toUTF16(default_allocator, buffer_slice, false)) |result_| { + if (toUTF16(default_allocator, moved_buffer_slice_8, false)) |result_| { if (result_) |result| { return ZigString.toExternalU16(result.ptr, result.len, globalThis); } @@ -664,15 +676,20 @@ pub const TextDecoder = struct { } // Experiment: using mimalloc directly is slightly slower - return ZigString.init(buffer_slice).toValueGC(globalThis); + return ZigString.init(moved_buffer_slice_8).toValueGC(globalThis); }, EncodingLabel.@"UTF-16LE" => { - if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) { - return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, buffer_slice))), globalThis); + const moved_buffer_slice_16 = if (!this.ignore_bom and buffer_slice.len > 2 and std.mem.eql(u8, &[_]u8{ '\xFF', '\xFE' }, buffer_slice[0..2])) + buffer_slice[2..] 
+ else + buffer_slice; + + if (std.mem.isAligned(@intFromPtr(moved_buffer_slice_16.ptr), @alignOf([*]const u16))) { + return this.decodeUTF16WithAlignment([]align(2) const u16, @as([]align(2) const u16, @alignCast(std.mem.bytesAsSlice(u16, moved_buffer_slice_16))), globalThis); } - return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, buffer_slice), globalThis); + return this.decodeUTF16WithAlignment([]align(1) const u16, std.mem.bytesAsSlice(u16, moved_buffer_slice_16), globalThis); }, else => { globalThis.throwInvalidArguments("TextDecoder.decode set to unsupported encoding", .{}); diff --git a/test/js/web/encoding/text-decoder.test.js b/test/js/web/encoding/text-decoder.test.js index dabdb0936..de71fb351 100644 --- a/test/js/web/encoding/text-decoder.test.js +++ b/test/js/web/encoding/text-decoder.test.js @@ -250,7 +250,7 @@ describe("TextDecoder", () => { it("constructor should set values", () => { const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: false }); expect(decoder.fatal).toBe(true); - // expect(decoder.ignoreBOM).toBe(false); // currently the getter for ignoreBOM doesn't work and always returns undefined + expect(decoder.ignoreBOM).toBe(false); }); it("should throw on invalid input", () => { @@ -265,6 +265,29 @@ describe("TextDecoder", () => { }); }); +describe("TextDecoder ignoreBOM", () => { + + it.each([ + { + encoding: 'utf-8', + bytes: [0xEF, 0xBB, 0xBF, 0x61, 0x62, 0x63] + }, + { + encoding: 'utf-16le', + bytes: [0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00] + } + ])('should ignoreBOM for: %o', ({encoding, bytes}) => { + const BOM = '\uFEFF'; + const array = new Uint8Array(bytes); + + const decoder_ignore_bom = new TextDecoder(encoding, {ignoreBOM: true}); + expect(decoder_ignore_bom.decode(array)).toStrictEqual(`${BOM}abc`); + + const decoder_not_ignore_bom = new TextDecoder(encoding, {ignoreBOM: false}); + expect(decoder_not_ignore_bom.decode(array)).toStrictEqual('abc'); + }); +}); + 
it("truncated sequences", () => { const assert_equals = (a, b) => expect(a).toBe(b); |