diff options
author | 2022-06-29 04:29:01 -0700 | |
---|---|---|
committer | 2022-06-29 04:29:01 -0700 | |
commit | 7d5464adbeee3ac5b9ea0b9004f2a5b99f2f3032 (patch) | |
tree | 0c764a686b839923d90c231cd3d6666f6391a439 | |
parent | 95c17852717c0857fcbce37169980ea563082e39 (diff) | |
download | bun-7d5464adbeee3ac5b9ea0b9004f2a5b99f2f3032.tar.gz bun-7d5464adbeee3ac5b9ea0b9004f2a5b99f2f3032.tar.zst bun-7d5464adbeee3ac5b9ea0b9004f2a5b99f2f3032.zip |
[encoder] Fix non-ascii latin1 characters
-rw-r--r-- | src/string_immutable.zig | 419 | ||||
-rw-r--r-- | test/bun.js/text-encoder.test.js | 53 |
2 files changed, 396 insertions, 76 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig index d2691a804..9e4cf3b1c 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -1010,77 +1010,168 @@ pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, } var list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len); + return (try allocateLatin1IntoUTF8WithList(list, 0, Type, latin1_)).toOwnedSlice(); +} + +pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list: usize, comptime Type: type, latin1_: Type) !std.ArrayList(u8) { var latin1 = latin1_; + var i: usize = offset_into_list; + var list = list_; while (latin1.len > 0) { - const read = @as(usize, firstNonASCII(latin1) orelse @intCast(u32, latin1.len)); - try list.ensureTotalCapacityPrecise( - list.items.len + read + if (read != latin1.len) @as(usize, 2) else @as(usize, 0), - ); - const before = list.items.len; - list.items.len += read; - @memcpy(list.items[before..].ptr, latin1.ptr, read); - latin1 = latin1[read..]; + try list.ensureUnusedCapacity(latin1.len); + // assert our starting capcaicty is at least latin1 + var buf = list.items.ptr[i..list.capacity]; - if (latin1.len > 0) { - try list.ensureUnusedCapacity(2); - var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2]; - list.items.len += 2; - buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); - latin1 = latin1[1..]; - } - } + inner: { + while (latin1.len >= ascii_vector_size) { + const vec: AsciiVector = latin1[0..ascii_vector_size].*; - return list.toOwnedSlice(); -} + if (@reduce(.Max, vec) > 127) { + const Int = u64; + const size = @sizeOf(Int); -pub fn allocateLatin1IntoUTF8ForArrayBuffer(allocator: std.mem.Allocator, globalThis: *JSC.JSGlobalObject, comptime Type: type, latin1_: Type) !JSC.JSValue { - if (comptime bun.FeatureFlags.latin1_is_now_ascii) { - var out = try allocator.alloc(u8, latin1_.len); - @memcpy(out.ptr, latin1_.ptr, latin1_.len); - return out; - } + // zig or LLVM doesn't do @ctz nicely with SIMD + if (comptime ascii_vector_size >= 8) { + { + const bytes = @bitCast(Int, latin1[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(latin1[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(latin1[j] < 127); + } + } - var latin1 = latin1_; + buf[0..size].* = @bitCast([size]u8, bytes); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } - if (firstNonASCII(latin1)) |start_i| { - var list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len + 2); - list.items.len = start_i; - @memcpy(list.items.ptr, latin1.ptr, start_i); - { - var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2]; - list.items.len += 2; - buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); - latin1 = latin1[1..]; - } + buf[0..size].* = @bitCast([size]u8, bytes); + latin1 = latin1[size..]; + buf = buf[size..]; + } - while (latin1.len > 0) { - const read = @as(usize, firstNonASCII(latin1) orelse @intCast(u32, latin1.len)); - try list.ensureTotalCapacityPrecise( - list.items.len + read + if (read != latin1.len) @as(usize, 2) else @as(usize, 0), - ); - const before = list.items.len; - list.items.len += read; - @memcpy(list.items[before..].ptr, latin1.ptr, read); - latin1 = latin1[read..]; - - if (latin1.len > 0) { - try list.ensureUnusedCapacity(2); - var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2]; - list.items.len += 2; - buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); + if (comptime ascii_vector_size >= 16) { + const bytes = @bitCast(Int, latin1[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(latin1[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(latin1[j] < 127); + } + } + + buf[0..size].* = @bitCast([size]u8, bytes); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + } + + unreachable; + } + } + + buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*; + latin1 = latin1[ascii_vector_size..]; + buf = buf[ascii_vector_size..]; + } + + { + const Int = u64; + const size = @sizeOf(Int); + while (latin1.len >= size) { + const bytes = @bitCast(Int, latin1[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(latin1[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(latin1[j] < 127); + } + } + + buf[0..size].* = @bitCast([size]u8, bytes); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + + buf[0..size].* = @bitCast([size]u8, bytes); + latin1 = latin1[size..]; + buf = buf[size..]; + } + } + + { + const Int = u32; + const size = @sizeOf(Int); + while (latin1.len >= size) { + const bytes = @bitCast(Int, latin1[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x80808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(latin1[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(latin1[j] < 127); + } + } + + buf[0..size].* = @bitCast([size]u8, bytes); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + + buf[0..size].* = @bitCast([size]u8, bytes); + latin1 = latin1[size..]; + buf = buf[size..]; + } + } + + while (latin1.len >= 1 and latin1[0] < 127) { + buf[0] = latin1[0]; latin1 = latin1[1..]; + buf = buf[1..]; } } - return JSC.ArrayBuffer.fromBytes(list.toOwnedSlice(), .Uint8Array).toJS(globalThis, null); - } + i = @ptrToInt(buf.ptr) - @ptrToInt(list.items.ptr); + list.items.len = i; - { - const array_buffer = JSC.JSValue.createUninitializedUint8Array(globalThis, latin1.len); - var bytes = array_buffer.asArrayBuffer(globalThis).?.slice(); - @memcpy(bytes.ptr, latin1.ptr, latin1.len); - return array_buffer; + while (latin1.len > 0 and latin1[0] >= 127) { + try list.ensureUnusedCapacity(2 + latin1.len); + buf = list.items.ptr[i..list.capacity]; + buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); + latin1 = latin1[1..]; + buf = buf[2..]; + + i = @ptrToInt(buf.ptr) - @ptrToInt(list.items.ptr); + list.items.len = i; + } } + + return list; } pub const UTF16Replacement = struct { @@ -1186,6 +1277,10 @@ pub fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement { } pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult { + return copyLatin1IntoUTF8StopOnNonASCII(buf_, Type, latin1_, false); +} + +pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_: Type, comptime stop: bool) EncodeIntoResult { if (comptime bun.FeatureFlags.latin1_is_now_ascii) { const to_copy = @truncate(u32, @minimum(buf_.len, latin1_.len)); @memcpy(buf_.ptr, latin1_.ptr, to_copy); @@ -1195,28 +1290,94 @@ pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) Encode var buf = buf_; var latin1 = latin1_; while (buf.len > 0 and latin1.len > 0) { - var read: usize = 0; + inner: { + while (@minimum(buf.len, latin1.len) >= ascii_vector_size) { + const vec: AsciiVector = latin1[0..ascii_vector_size].*; - while (latin1.len > ascii_vector_size) { - const vec: AsciiVector = latin1[0..ascii_vector_size].*; + if (@reduce(.Max, vec) > 127) { + if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; + break; + } - if (@reduce(.Max, vec) > 127) { - break; + buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*; + latin1 = latin1[ascii_vector_size..]; + buf = buf[ascii_vector_size..]; } - buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*; - latin1 = latin1[ascii_vector_size..]; - buf = buf[ascii_vector_size..]; - } + { + const Int = u64; + const size = @sizeOf(Int); + while (@minimum(buf.len, latin1.len) >= size) { + const bytes = @bitCast(Int, latin1[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; + + if (comptime Environment.allow_assert) { + assert(latin1[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(latin1[j] < 127); + } + } - while (read < latin1.len and latin1[read] < 0x80) : (read += 1) {} + buf[0..size].* = @bitCast([size]u8, bytes); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; - const to_copy = @minimum(read, buf.len); - @memcpy(buf.ptr, latin1.ptr, to_copy); - latin1 = latin1[to_copy..]; - buf = buf[to_copy..]; + break :inner; + } + + buf[0..size].* = @bitCast([size]u8, bytes); + latin1 = latin1[size..]; + buf = buf[size..]; + } + } + + { + const Int = u32; + const size = @sizeOf(Int); + while (@minimum(buf.len, latin1.len) >= size) { + const bytes = @bitCast(Int, latin1[0..size].*); + const mask = bytes & 0x80808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; + + if (comptime Environment.allow_assert) { + assert(latin1[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(latin1[j] < 127); + } + } + + buf[0..size].* = @bitCast([size]u8, bytes); + buf = buf[first_set_byte..]; + latin1 = latin1[first_set_byte..]; + break :inner; + } + + buf[0..size].* = @bitCast([size]u8, bytes); + latin1 = latin1[size..]; + buf = buf[size..]; + } + } + + while (@minimum(buf.len, latin1.len) >= 1 and latin1[0] < 127) { + buf[0] = latin1[0]; + latin1 = latin1[1..]; + buf = buf[1..]; + } + } if (latin1.len > 0 and buf.len >= 2) { + if (comptime stop) return .{ .written = std.math.maxInt(u32), .read = std.math.maxInt(u32) }; + buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); latin1 = latin1[1..]; buf = buf[2..]; @@ -1955,7 +2116,7 @@ pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) } } -test "copyLatin1IntoUTF8" { +test "copyLatin1IntoUTF8 - ascii" { var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!"; var output = std.mem.zeroes([500]u8); const result = copyLatin1IntoUTF8(&output, string, input); @@ -1965,6 +2126,28 @@ test "copyLatin1IntoUTF8" { try std.testing.expectEqualSlices(u8, input, output[0..result.written]); } +test "copyLatin1IntoUTF8 - latin1" { + { + var input: string = &[_]u8{ 104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 32, 169 }; + var output = std.mem.zeroes([500]u8); + var expected = "hello world ©"; + const result = copyLatin1IntoUTF8(&output, string, input); + try std.testing.expectEqual(input.len, result.read); + + try std.testing.expectEqualSlices(u8, expected, output[0..result.written]); + } + + { + var input: string = &[_]u8{ 72, 169, 101, 108, 108, 169, 111, 32, 87, 111, 114, 169, 108, 100, 33 }; + var output = std.mem.zeroes([500]u8); + var expected = "H©ell©o Wor©ld!"; + const result = copyLatin1IntoUTF8(&output, string, input); + try std.testing.expectEqual(input.len, result.read); + + try std.testing.expectEqualSlices(u8, expected, output[0..result.written]); + } +} + pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType { return @intCast( CodePointType, @@ -1976,6 +2159,7 @@ pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) C } pub fn latin1ToCodepointBytesAssumeNotASCIIWIthCharType(comptime Char: type, char: u32) [2]Char { + assert(char > 127); return [2]Char{ @as(Char, @truncate(u8, 0xc0 | char >> 6)), @as(Char, @truncate(u8, 0x80 | (char & 0x3f))), @@ -2299,16 +2483,99 @@ pub fn firstNonASCIIWithType(comptime Type: type, slice: Type) ?u32 { const vec: AsciiVector = remaining[0..ascii_vector_size].*; if (@reduce(.Max, vec) > 127) { - const cmp = vec > max_16_ascii; - const bitmask = @ptrCast(*const AsciiVectorInt, &cmp).*; - const first = @ctz(AsciiVectorInt, bitmask); - return @as(u32, first) + @intCast(u32, slice.len - remaining.len); + const Int = u64; + const size = @sizeOf(Int); + { + const bytes = @bitCast(Int, remaining[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(remaining[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(remaining[j] < 127); + } + } + + return @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len); + } + } + { + const bytes = @bitCast(Int, remaining[size..][0..size].*); + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(remaining[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(remaining[j] < 127); + } + } + + return 8 + @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len); + } + } + break; } remaining = remaining[ascii_vector_size..]; } } + { + const Int = u64; + const size = @sizeOf(Int); + while (remaining.len >= size) { + const bytes = @bitCast(Int, remaining[0..size].*); + // https://dotat.at/@/2022-06-27-tolower-swar.html + const mask = bytes & 0x8080808080808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(remaining[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(remaining[j] < 127); + } + } + + return @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len); + } + + remaining = remaining[size..]; + } + } + + { + const Int = u32; + const size = @sizeOf(Int); + while (remaining.len >= size) { + const bytes = @bitCast(Int, remaining[0..size].*); + const mask = bytes & 0x80808080; + + if (mask > 0) { + const first_set_byte = @ctz(Int, mask) / 8; + if (comptime Environment.allow_assert) { + assert(remaining[first_set_byte] >= 127); + var j: usize = 0; + while (j < first_set_byte) : (j += 1) { + assert(remaining[j] < 127); + } + } + + return @as(u32, first_set_byte) + @intCast(u32, slice.len - remaining.len); + } + + remaining = remaining[size..]; + } + } + for (remaining) |char, i| { if (char > 127) { return @truncate(u32, i + (slice.len - remaining.len)); diff --git a/test/bun.js/text-encoder.test.js b/test/bun.js/text-encoder.test.js index 5687e0222..5f8778bde 100644 --- a/test/bun.js/text-encoder.test.js +++ b/test/bun.js/text-encoder.test.js @@ -69,6 +69,31 @@ describe("TextDecoder", () => { }); describe("TextEncoder", () => { + it("should encode latin1 text with non-ascii latin1 characters", () => { + var text = "H©ell©o Wor©ld!"; + + gcTrace(true); + const encoder = new TextEncoder(); + const encoded = encoder.encode(text); + gcTrace(true); + const into = new Uint8Array(100); + const out = encoder.encodeInto(text, into); + gcTrace(true); + expect(out.read).toBe(text.length); + + expect(encoded instanceof Uint8Array).toBe(true); + const result = [ + 72, 194, 169, 101, 108, 108, 194, 169, 111, 32, 87, 111, 114, 194, 169, + 108, 100, 33, + ]; + for (let i = 0; i < result.length; i++) { + expect(encoded[i]).toBe(result[i]); + expect(into[i]).toBe(result[i]); + } + expect(encoded.length).toBe(result.length); + expect(out.written).toBe(result.length); + }); + it("should encode latin1 text", () => { gcTrace(true); const text = "Hello World!"; @@ -126,6 +151,34 @@ describe("TextEncoder", () => { expect(encoded.length).toBe(getByteLength(text)); }); + it("should encode latin1 rope text with non-ascii latin1 characters", () => { + var text = "H©ell©o"; + text += " "; + text += "Wor©ld!"; + + gcTrace(true); + const encoder = new TextEncoder(); + const encoded = encoder.encode(text); + gcTrace(true); + const into = new Uint8Array(100); + const out = encoder.encodeInto(text, into); + gcTrace(true); + expect(out.read).toBe(text.length); + + expect(encoded instanceof Uint8Array).toBe(true); + const result = [ + 72, 194, 169, 101, 108, 108, 194, 169, 111, 32, 87, 111, 114, 194, 169, + 108, 100, 33, + ]; + + for (let i = 0; i < result.length; i++) { + expect(encoded[i]).toBe(into[i]); + expect(encoded[i]).toBe(result[i]); + } + expect(encoded.length).toBe(result.length); + expect(out.written).toBe(encoded.length); + }); + it("should encode utf-16 text", () => { var text = `❤️ Red Heart ✨ Sparkles |