aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/bun.js/base.zig3
-rw-r--r--src/string_immutable.zig253
2 files changed, 111 insertions, 145 deletions
diff --git a/src/bun.js/base.zig b/src/bun.js/base.zig
index 1271deb19..2a0e3d037 100644
--- a/src/bun.js/base.zig
+++ b/src/bun.js/base.zig
@@ -2155,7 +2155,7 @@ pub fn JSError(
if (comptime std.meta.fields(@TypeOf(args)).len == 0) {
var zig_str = JSC.ZigString.init(fmt);
- if (comptime !strings.isAllASCII(fmt)) {
+ if (comptime !strings.isAllASCIISimple(fmt)) {
zig_str.markUTF16();
}
@@ -2167,7 +2167,6 @@ pub fn JSError(
var buf = std.fmt.allocPrint(allocator, fmt, args) catch unreachable;
var zig_str = JSC.ZigString.init(buf);
zig_str.detectEncoding();
- zig_str.mark();
// it alwayas clones
exception.* = zig_str.toErrorInstance(ctx).asObjectRef();
allocator.free(buf);
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 9e4461d3a..d4dc7c6c1 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1017,144 +1017,110 @@ pub fn allocateLatin1IntoUTF8WithList(list_: std.ArrayList(u8), offset_into_list
var latin1 = latin1_;
var i: usize = offset_into_list;
var list = list_;
+ try list.ensureUnusedCapacity(latin1.len);
+
while (latin1.len > 0) {
- try list.ensureUnusedCapacity(latin1.len);
+ assert(i < list.capacity);
var buf = list.items.ptr[i..list.capacity];
inner: {
- if (latin1.len >= ascii_vector_size) {
- const start_ptr = @ptrToInt(buf.ptr);
- const start_ptr_latin1 = @ptrToInt(latin1.ptr);
- const end_ptr = @ptrToInt(latin1.ptr + latin1.len - (latin1.len % ascii_vector_size));
+ var count = latin1.len / ascii_vector_size;
+ while (count > 0) : (count -= 1) {
+ const vec: AsciiVector = latin1[0..ascii_vector_size].*;
- while (@ptrToInt(latin1.ptr) < end_ptr) {
- const vec: AsciiVector = latin1[0..ascii_vector_size].*;
+ if (@reduce(.Max, vec) > 127) {
+ const Int = u64;
+ const size = @sizeOf(Int);
- if (@reduce(.Max, vec) > 127) {
- const Int = u64;
- const size = @sizeOf(Int);
- buf.len -= @ptrToInt(buf.ptr) - start_ptr;
- latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
-
- // zig or LLVM doesn't do @ctz nicely with SIMD
- if (comptime ascii_vector_size >= 8) {
- {
- const bytes = @bitCast(Int, latin1[0..size].*);
- // https://dotat.at/@/2022-06-27-tolower-swar.html
- const mask = bytes & 0x8080808080808080;
-
- if (mask > 0) {
- const first_set_byte = @ctz(Int, mask) / 8;
- if (comptime Environment.allow_assert) {
- assert(latin1[first_set_byte] >= 127);
- var j: usize = 0;
- while (j < first_set_byte) : (j += 1) {
- assert(latin1[j] < 127);
- }
- }
-
- buf[0..size].* = @bitCast([size]u8, bytes);
- buf = buf[first_set_byte..];
- latin1 = latin1[first_set_byte..];
- break :inner;
+ // zig or LLVM doesn't do @ctz nicely with SIMD
+ if (comptime ascii_vector_size >= 8) {
+ {
+ const bytes = @bitCast(Int, latin1[0..size].*);
+ // https://dotat.at/@/2022-06-27-tolower-swar.html
+ const mask = bytes & 0x8080808080808080;
+
+ if (mask > 0) {
+ const first_set_byte = @ctz(Int, mask) / 8;
+ if (comptime Environment.allow_assert) {
+ assert(latin1[first_set_byte] >= 127);
}
buf[0..size].* = @bitCast([size]u8, bytes);
- latin1 = latin1[size..];
- buf = buf[size..];
+ buf = buf[first_set_byte..];
+ latin1 = latin1[first_set_byte..];
+ break :inner;
}
- if (comptime ascii_vector_size >= 16) {
- const bytes = @bitCast(Int, latin1[0..size].*);
- // https://dotat.at/@/2022-06-27-tolower-swar.html
- const mask = bytes & 0x8080808080808080;
-
- if (mask > 0) {
- const first_set_byte = @ctz(Int, mask) / 8;
- if (comptime Environment.allow_assert) {
- assert(latin1[first_set_byte] >= 127);
- var j: usize = 0;
- while (j < first_set_byte) : (j += 1) {
- assert(latin1[j] < 127);
- }
- }
-
- buf[0..size].* = @bitCast([size]u8, bytes);
- buf = buf[first_set_byte..];
- latin1 = latin1[first_set_byte..];
- break :inner;
+ buf[0..size].* = @bitCast([size]u8, bytes);
+ latin1 = latin1[size..];
+ buf = buf[size..];
+ }
+
+ if (comptime ascii_vector_size >= 16) {
+ const bytes = @bitCast(Int, latin1[0..size].*);
+ // https://dotat.at/@/2022-06-27-tolower-swar.html
+ const mask = bytes & 0x8080808080808080;
+
+ if (mask > 0) {
+ const first_set_byte = @ctz(Int, mask) / 8;
+ if (comptime Environment.allow_assert) {
+ assert(latin1[first_set_byte] >= 127);
}
+
+ buf[0..size].* = @bitCast([size]u8, bytes);
+ buf = buf[first_set_byte..];
+ latin1 = latin1[first_set_byte..];
+ break :inner;
}
}
- unreachable;
}
-
- buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
- latin1.ptr += ascii_vector_size;
- buf.ptr += ascii_vector_size;
+ unreachable;
}
- buf.len -= @ptrToInt(buf.ptr) - start_ptr;
- latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
+
+ buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
+ latin1 = latin1[ascii_vector_size..];
+ buf = buf[ascii_vector_size..];
}
- {
+ while (latin1.len >= 8) {
const Int = u64;
const size = @sizeOf(Int);
- const latin1_end_ptr = latin1.ptr + latin1.len - (latin1.len % size);
- const start_ptr = @ptrToInt(buf.ptr);
- const start_ptr_latin1 = @ptrToInt(latin1.ptr);
- while (latin1.ptr != latin1_end_ptr) {
- const bytes = @bitCast(Int, latin1[0..size].*);
- // https://dotat.at/@/2022-06-27-tolower-swar.html
- const mask = bytes & 0x8080808080808080;
- if (mask > 0) {
- const first_set_byte = @ctz(Int, mask) / 8;
- if (comptime Environment.allow_assert) {
- assert(latin1[first_set_byte] >= 127);
- var j: usize = 0;
- while (j < first_set_byte) : (j += 1) {
- assert(latin1[j] < 127);
- }
- }
+ const bytes = @bitCast(Int, latin1[0..size].*);
+ // https://dotat.at/@/2022-06-27-tolower-swar.html
+ const mask = bytes & 0x8080808080808080;
- buf[0..size].* = @bitCast([size]u8, bytes);
- buf.ptr += first_set_byte;
- latin1.ptr += first_set_byte;
- buf.len -= @ptrToInt(buf.ptr) - start_ptr;
- latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
- break :inner;
+ if (mask > 0) {
+ const first_set_byte = @ctz(Int, mask) / 8;
+ if (comptime Environment.allow_assert) {
+ assert(latin1[first_set_byte] >= 127);
}
buf[0..size].* = @bitCast([size]u8, bytes);
- latin1.ptr += size;
- buf.ptr += size;
+ latin1 = latin1[first_set_byte..];
+ buf = buf[first_set_byte..];
+ break :inner;
}
- buf.len -= @ptrToInt(buf.ptr) - start_ptr;
- latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
+
+ buf[0..size].* = @bitCast([size]u8, bytes);
+ latin1 = latin1[size..];
+ buf = buf[size..];
}
{
assert(latin1.len < 8);
const end = latin1.ptr + latin1.len;
- const start_ptr = @ptrToInt(buf.ptr);
- const start_ptr_latin1 = @ptrToInt(latin1.ptr);
-
- while (latin1.ptr != end and latin1.ptr[0] <= 127) {
- buf.ptr[0] = latin1.ptr[0];
- buf.ptr += 1;
- latin1.ptr += 1;
+ while (latin1.ptr != end and latin1[0] < 128) {
+ buf[0] = latin1[0];
+ buf = buf[1..];
+ latin1 = latin1[1..];
}
-
- buf.len -= @ptrToInt(buf.ptr) - start_ptr;
- latin1.len -= @ptrToInt(latin1.ptr) - start_ptr_latin1;
}
}
while (latin1.len > 0 and latin1[0] > 127) {
i = @ptrToInt(buf.ptr) - @ptrToInt(list.items.ptr);
list.items.len = i;
-
try list.ensureUnusedCapacity(2 + latin1.len);
buf = list.items.ptr[i..list.capacity];
buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]);
@@ -1286,7 +1252,8 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
var latin1 = latin1_;
while (buf.len > 0 and latin1.len > 0) {
inner: {
- while (@minimum(buf.len, latin1.len) >= ascii_vector_size) {
+ var remaining_runs = @minimum(buf.len, latin1.len) / ascii_vector_size;
+ while (remaining_runs > 0) : (remaining_runs -= 1) {
const vec: AsciiVector = latin1[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
@@ -1302,23 +1269,19 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
+ buf[0..size].* = @bitCast([size]u8, bytes);
+
if (mask > 0) {
const first_set_byte = @ctz(Int, mask) / 8;
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
- var j: usize = 0;
- while (j < first_set_byte) : (j += 1) {
- assert(latin1[j] < 127);
- }
}
- buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
- buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
@@ -1328,24 +1291,20 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
// https://dotat.at/@/2022-06-27-tolower-swar.html
const mask = bytes & 0x8080808080808080;
- if (mask > 0) {
- const first_set_byte = @ctz(Int, mask) / 8;
- if (comptime Environment.allow_assert) {
- assert(latin1[first_set_byte] >= 127);
- var j: usize = 0;
- while (j < first_set_byte) : (j += 1) {
- assert(latin1[j] < 127);
- }
- }
+ buf[0..size].* = @bitCast([size]u8, bytes);
- buf[0..size].* = @bitCast([size]u8, bytes);
- buf = buf[first_set_byte..];
- latin1 = latin1[first_set_byte..];
- break :inner;
+ assert(mask > 0);
+ const first_set_byte = @ctz(Int, mask) / 8;
+ if (comptime Environment.allow_assert) {
+ assert(latin1[first_set_byte] >= 127);
}
+
+ buf = buf[first_set_byte..];
+ latin1 = latin1[first_set_byte..];
+ break :inner;
}
}
- break;
+ unreachable;
}
buf[0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[0..ascii_vector_size].*;
@@ -1358,7 +1317,10 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
const size = @sizeOf(Int);
while (@minimum(buf.len, latin1.len) >= size) {
const bytes = @bitCast(Int, latin1[0..size].*);
+ buf[0..size].* = @bitCast([size]u8, bytes);
+
// https://dotat.at/@/2022-06-27-tolower-swar.html
+
const mask = bytes & 0x8080808080808080;
if (mask > 0) {
@@ -1367,20 +1329,14 @@ pub fn copyLatin1IntoUTF8StopOnNonASCII(buf_: []u8, comptime Type: type, latin1_
if (comptime Environment.allow_assert) {
assert(latin1[first_set_byte] >= 127);
- var j: usize = 0;
- while (j < first_set_byte) : (j += 1) {
- assert(latin1[j] < 127);
- }
}
- buf[0..size].* = @bitCast([size]u8, bytes);
buf = buf[first_set_byte..];
latin1 = latin1[first_set_byte..];
break :inner;
}
- buf[0..size].* = @bitCast([size]u8, bytes);
latin1 = latin1[size..];
buf = buf[size..];
}
@@ -1422,6 +1378,7 @@ pub fn replaceLatin1WithUTF8(buf_: []u8) void {
var latin1 = buf_;
while (strings.firstNonASCII(latin1)) |i| {
latin1[i..][0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[i]);
+
latin1 = latin1[i + 2 ..];
}
}
@@ -2464,31 +2421,32 @@ pub fn isAllASCII(slice: []const u8) bool {
// The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers
if (comptime Environment.isAarch64 or Environment.isX64) {
- while (remaining.len >= 128) {
- comptime var count: usize = 0;
- inline while (count < 8) : (count += 1) {
- const vec: AsciiVector = remaining[(comptime count * ascii_vector_size)..][0..ascii_vector_size].*;
-
- if (@reduce(.Max, vec) > 127) {
- return false;
- }
- }
- remaining = remaining[comptime ascii_vector_size * count..];
- }
-
- while (remaining.len >= ascii_vector_size) {
+ const remaining_end_ptr = remaining.ptr + remaining.len - (remaining.len % ascii_vector_size);
+ while (remaining.ptr != remaining_end_ptr) : (remaining.ptr += ascii_vector_size) {
const vec: AsciiVector = remaining[0..ascii_vector_size].*;
if (@reduce(.Max, vec) > 127) {
return false;
}
+ }
+ }
- remaining = remaining[ascii_vector_size..];
+ const Int = u64;
+ const size = @sizeOf(Int);
+ const remaining_last8 = slice.ptr + slice.len - (slice.len % size);
+ while (remaining.ptr != remaining_last8) : (remaining.ptr += size) {
+ const bytes = @bitCast(Int, remaining[0..size].*);
+ // https://dotat.at/@/2022-06-27-tolower-swar.html
+ const mask = bytes & 0x8080808080808080;
+
+ if (mask > 0) {
+ return false;
}
}
- for (remaining) |char| {
- if (char > 127) {
+ const final = slice.ptr + slice.len;
+ while (remaining.ptr != final) : (remaining.ptr += 1) {
+ if (remaining[0] > 127) {
return false;
}
}
@@ -2496,6 +2454,15 @@ pub fn isAllASCII(slice: []const u8) bool {
return true;
}
+pub fn isAllASCIISimple(comptime slice: []const u8) bool {
+ for (slice) |char| {
+ if (char > 127) {
+ return false;
+ }
+ }
+ return true;
+}
+
//#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
pub inline fn u16Lead(supplementary: anytype) u16 {
return @intCast(u16, (supplementary >> 10) + 0xd7c0);