diff options
Diffstat (limited to 'src/string_immutable.zig')
-rw-r--r-- | src/string_immutable.zig | 160 |
1 files changed, 120 insertions, 40 deletions
diff --git a/src/string_immutable.zig b/src/string_immutable.zig index f4c6fae07..889add550 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -20,7 +20,7 @@ pub inline fn containsChar(self: string, char: u8) bool { } pub inline fn contains(self: string, str: string) bool { - return std.mem.indexOf(u8, self, str) != null; + return indexOf(self, str) != null; } pub fn toUTF16Literal(comptime str: []const u8) []const u16 { @@ -41,11 +41,9 @@ pub fn toUTF16Literal(comptime str: []const u8) []const u16 { const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1); pub fn indexOfAny(self: string, comptime str: anytype) ?OptionalUsize { - for (self, 0..) |c, i| { - inline for (str) |a| { - if (c == a) { - return @intCast(OptionalUsize, i); - } + inline for (str) |a| { + if (indexOfChar(self, a)) |i| { + return @intCast(OptionalUsize, i); } } @@ -148,6 +146,79 @@ pub fn indexOfCharNeg(self: string, char: u8) i32 { return -1; } +/// Format a string to an ECMAScript identifier. +/// Unlike the string_mutable.zig version, this always allocate/copy +pub fn fmtIdentifier(name: string) FormatValidIdentifier { + return FormatValidIdentifier{ .name = name }; +} + +/// Format a string to an ECMAScript identifier. +/// Different implementation than string_mutable because string_mutable may avoid allocating +/// This will always allocate +pub const FormatValidIdentifier = struct { + name: string, + const js_lexer = @import("./js_lexer.zig"); + pub fn format(self: FormatValidIdentifier, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + var iterator = strings.CodepointIterator.init(self.name); + var cursor = strings.CodepointIterator.Cursor{}; + + var has_needed_gap = false; + var needs_gap = false; + var start_i: usize = 0; + + if (!iterator.next(&cursor)) { + try writer.writeAll("_"); + return; + } + + // Common case: no gap necessary. No allocation necessary. + needs_gap = !js_lexer.isIdentifierStart(cursor.c); + if (!needs_gap) { + // Are there any non-alphanumeric chars at all? + while (iterator.next(&cursor)) { + if (!js_lexer.isIdentifierContinue(cursor.c) or cursor.width > 1) { + needs_gap = true; + start_i = cursor.i; + break; + } + } + } + + if (needs_gap) { + needs_gap = false; + if (start_i > 0) try writer.writeAll(self.name[0..start_i]); + var slice = self.name[start_i..]; + iterator = strings.CodepointIterator.init(slice); + cursor = strings.CodepointIterator.Cursor{}; + + while (iterator.next(&cursor)) { + if (js_lexer.isIdentifierContinue(cursor.c) and cursor.width == 1) { + if (needs_gap) { + try writer.writeAll("_"); + needs_gap = false; + has_needed_gap = true; + } + try writer.writeAll(slice[cursor.i .. cursor.i + @as(u32, cursor.width)]); + } else if (!needs_gap) { + needs_gap = true; + // skip the code point, replace it with a single _ + } + } + + // If it ends with an emoji + if (needs_gap) { + try writer.writeAll("_"); + needs_gap = false; + has_needed_gap = true; + } + + return; + } + + try writer.writeAll(self.name); + } +}; + pub fn indexOfSigned(self: string, str: string) i32 { const i = std.mem.indexOf(u8, self, str) orelse return -1; return @intCast(i32, i); @@ -177,7 +248,9 @@ pub inline fn indexOf(self: string, str: string) ?usize { const start = bun.C.memmem(self_ptr, self_len, str_ptr, str_len) orelse return null; - return @ptrToInt(start) - @ptrToInt(self_ptr); + const i = @ptrToInt(start) - @ptrToInt(self_ptr); + std.debug.assert(i < self_len); + return @intCast(usize, i); } pub fn split(self: string, delimiter: string) SplitIterator { @@ -2899,12 +2972,12 @@ pub const max_16_ascii = @splat(ascii_vector_size, @as(u8, 127)); pub const min_16_ascii = @splat(ascii_vector_size, @as(u8, 0x20)); pub const max_u16_ascii = @splat(ascii_u16_vector_size, @as(u16, 127)); pub const min_u16_ascii = @splat(ascii_u16_vector_size, @as(u16, 0x20)); -pub const AsciiVector = std.meta.Vector(ascii_vector_size, u8); -pub const AsciiVectorSmall = std.meta.Vector(8, u8); -pub const AsciiVectorU1 = std.meta.Vector(ascii_vector_size, u1); -pub const AsciiVectorU1Small = std.meta.Vector(8, u1); -pub const AsciiVectorU16U1 = std.meta.Vector(ascii_u16_vector_size, u1); -pub const AsciiU16Vector = std.meta.Vector(ascii_u16_vector_size, u16); +pub const AsciiVector = @Vector(ascii_vector_size, u8); +pub const AsciiVectorSmall = @Vector(8, u8); +pub const AsciiVectorU1 = @Vector(ascii_vector_size, u1); +pub const AsciiVectorU1Small = @Vector(8, u1); +pub const AsciiVectorU16U1 = @Vector(ascii_u16_vector_size, u1); +pub const AsciiU16Vector = @Vector(ascii_u16_vector_size, u16); pub const max_4_ascii = @splat(4, @as(u8, 127)); pub fn isAllASCII(slice: []const u8) bool { if (bun.FeatureFlags.use_simdutf) @@ -3200,34 +3273,15 @@ pub fn indexOfCharZ(sliceZ: [:0]const u8, char: u8) ?u63 { } pub fn indexOfChar(slice: []const u8, char: u8) ?u32 { - var remaining = slice; - if (remaining.len == 0) + if (slice.len == 0) return null; - if (remaining[0] == char) - return 0; - - if (comptime Environment.enableSIMD) { - while (remaining.len >= ascii_vector_size) { - const vec: AsciiVector = remaining[0..ascii_vector_size].*; - const cmp = vec == @splat(ascii_vector_size, char); - - if (@reduce(.Max, @bitCast(AsciiVectorU1, cmp)) > 0) { - const bitmask = @bitCast(AsciiVectorInt, cmp); - const first = @ctz(bitmask); - return @intCast(u32, @as(u32, first) + @intCast(u32, slice.len - remaining.len)); - } - remaining = remaining[ascii_vector_size..]; - } - } - - for (remaining, 0..) |c, i| { - if (c == char) { - return @truncate(u32, i + (slice.len - remaining.len)); - } - } + const ptr = bun.C.memchr(slice.ptr, char, slice.len) orelse return null; + const i = @ptrToInt(ptr) - @ptrToInt(slice.ptr); + std.debug.assert(i < slice.len); + std.debug.assert(slice[i] == char); - return null; + return @truncate(u32, i); } test "indexOfChar" { @@ -3829,25 +3883,51 @@ pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Alloca return try std.mem.join(allocator, delimiter, slices); } +pub fn order(a: []const u8, b: []const u8) std.math.Order { + const len = @min(a.len, b.len); + const cmp = bun.C.memcmp(a.ptr, b.ptr, len); + return switch (std.math.sign(cmp)) { + 0 => std.math.order(a.len, b.len), + 1 => .gt, + -1 => .lt, + else => unreachable, + }; +} + pub fn cmpStringsAsc(_: void, a: string, b: string) bool { - return std.mem.order(u8, a, b) == .lt; + return order(a, b) == .lt; } pub fn cmpStringsDesc(_: void, a: string, b: string) bool { - return std.mem.order(u8, a, b) == .gt; + return order(a, b) == .gt; } const sort_asc = std.sort.asc(u8); const sort_desc = std.sort.desc(u8); pub fn sortAsc(in: []string) void { + // TODO: experiment with simd to see if it's faster std.sort.sort([]const u8, in, {}, cmpStringsAsc); } pub fn sortDesc(in: []string) void { + // TODO: experiment with simd to see if it's faster std.sort.sort([]const u8, in, {}, cmpStringsDesc); } +pub const StringArrayByIndexSorter = struct { + keys: []const []const u8, + pub fn lessThan(sorter: *const @This(), a: usize, b: usize) bool { + return strings.order(sorter.keys[a], sorter.keys[b]) == .lt; + } + + pub fn init(keys: []const []const u8) @This() { + return .{ + .keys = keys, + }; + } +}; + pub fn isASCIIHexDigit(c: u8) bool { return std.ascii.isHex(c); } |