const std = @import("std"); const expect = std.testing.expect; const Environment = @import("./env.zig"); const string = @import("string_types.zig").string; const stringZ = @import("string_types.zig").stringZ; const CodePoint = @import("string_types.zig").CodePoint; const assert = std.debug.assert; pub inline fn containsChar(self: string, char: u8) bool { return indexOfChar(self, char) != null; } pub inline fn contains(self: string, str: string) bool { return std.mem.indexOf(u8, self, str) != null; } pub const includes = contains; pub inline fn containsAny(in: anytype, target: string) bool { for (in) |str| if (contains(str, target)) return true; return false; } pub inline fn indexAny(in: anytype, target: string) ?usize { for (in) |str, i| if (indexOf(str, target) != null) return i; return null; } pub inline fn indexAnyComptime(target: string, comptime chars: string) ?usize { for (target) |parent, i| { inline for (chars) |char| { if (char == parent) return i; } } return null; } pub inline fn indexOfChar(self: string, char: u8) ?usize { return std.mem.indexOfScalar(@TypeOf(char), self, char); } pub fn indexOfCharNeg(self: string, char: u8) i32 { var i: u32 = 0; while (i < self.len) : (i += 1) { if (self[i] == char) return @intCast(i32, i); } return -1; } pub fn indexOfSigned(self: string, str: string) i32 { const i = std.mem.indexOf(u8, self, str) orelse return -1; return @intCast(i32, i); } pub inline fn lastIndexOfChar(self: string, char: u8) ?usize { return std.mem.lastIndexOfScalar(u8, self, char); } pub inline fn lastIndexOf(self: string, str: string) ?usize { return std.mem.lastIndexOf(u8, self, str); } pub inline fn indexOf(self: string, str: string) ?usize { return std.mem.indexOf(u8, self, str); } pub fn cat(allocator: std.mem.Allocator, first: string, second: string) !string { var out = try allocator.alloc(u8, first.len + second.len); std.mem.copy(u8, out, first); std.mem.copy(u8, out[first.len..], second); return out; } // 30 character string or a slice pub const StringOrTinyString = struct { pub const Max = 30; const Buffer = [Max]u8; remainder_buf: Buffer = undefined, remainder_len: u7 = 0, is_tiny_string: u1 = 0, pub inline fn slice(this: *const StringOrTinyString) []const u8 { // This is a switch expression instead of a statement to make sure it uses the faster assembly return switch (this.is_tiny_string) { 1 => this.remainder_buf[0..this.remainder_len], 0 => @intToPtr([*]const u8, std.mem.readIntNative(usize, this.remainder_buf[0..@sizeOf(usize)]))[0..std.mem.readIntNative(usize, this.remainder_buf[@sizeOf(usize) .. @sizeOf(usize) * 2])], }; } pub fn deinit(this: *StringOrTinyString, _: std.mem.Allocator) void { if (this.is_tiny_string == 1) return; // var slice_ = this.slice(); // allocator.free(slice_); } pub fn initAppendIfNeeded(stringy: string, comptime Appender: type, appendy: Appender) !StringOrTinyString { if (stringy.len <= StringOrTinyString.Max) { return StringOrTinyString.init(stringy); } return StringOrTinyString.init(try appendy.append(string, stringy)); } pub fn init(stringy: string) StringOrTinyString { switch (stringy.len) { 0 => { return StringOrTinyString{ .is_tiny_string = 1, .remainder_len = 0 }; }, 1...(@sizeOf(Buffer)) => { @setRuntimeSafety(false); var tiny = StringOrTinyString{ .is_tiny_string = 1, .remainder_len = @truncate(u7, stringy.len), }; @memcpy(&tiny.remainder_buf, stringy.ptr, tiny.remainder_len); return tiny; }, else => { var tiny = StringOrTinyString{ .is_tiny_string = 0, .remainder_len = 0, }; std.mem.writeIntNative(usize, tiny.remainder_buf[0..@sizeOf(usize)], @ptrToInt(stringy.ptr)); std.mem.writeIntNative(usize, tiny.remainder_buf[@sizeOf(usize) .. @sizeOf(usize) * 2], stringy.len); return tiny; }, } } pub fn initLowerCase(stringy: string) StringOrTinyString { switch (stringy.len) { 0 => { return StringOrTinyString{ .is_tiny_string = 1, .remainder_len = 0 }; }, 1...(@sizeOf(Buffer)) => { @setRuntimeSafety(false); var tiny = StringOrTinyString{ .is_tiny_string = 1, .remainder_len = @truncate(u7, stringy.len), }; _ = copyLowercase(stringy, &tiny.remainder_buf); return tiny; }, else => { var tiny = StringOrTinyString{ .is_tiny_string = 0, .remainder_len = 0, }; std.mem.writeIntNative(usize, tiny.remainder_buf[0..@sizeOf(usize)], @ptrToInt(stringy.ptr)); std.mem.writeIntNative(usize, tiny.remainder_buf[@sizeOf(usize) .. @sizeOf(usize) * 2], stringy.len); return tiny; }, } } }; pub fn copyLowercase(in: string, out: []u8) string { @setRuntimeSafety(false); var in_slice: string = in; var out_slice: []u8 = out[0..in.len]; begin: while (out_slice.len > 0) { @setRuntimeSafety(false); for (in_slice) |c, i| { @setRuntimeSafety(false); switch (c) { 'A'...'Z' => { @setRuntimeSafety(false); @memcpy(out_slice.ptr, in_slice.ptr, i); out_slice[i] = std.ascii.toLower(c); const end = i + 1; if (end >= out_slice.len) break :begin; in_slice = in_slice[end..]; out_slice = out_slice[end..]; continue :begin; }, else => {}, } } @memcpy(out_slice.ptr, in_slice.ptr, in_slice.len); break :begin; } return out[0..in.len]; } test "eqlComptimeCheckLen" { try std.testing.expectEqual(eqlComptime("bun-darwin-aarch64.zip", "bun-darwin-aarch64.zip"), true); const sizes = [_]u8{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23, 22, 24 }; inline for (sizes) |size| { var buf: [size]u8 = undefined; std.mem.set(u8, &buf, 'a'); var buf_copy: [size]u8 = undefined; std.mem.set(u8, &buf_copy, 'a'); var bad: [size]u8 = undefined; std.mem.set(u8, &bad, 'b'); try std.testing.expectEqual(std.mem.eql(u8, &buf, &buf_copy), eqlComptime(&buf, comptime brk: { var buf_copy_: [size]u8 = undefined; std.mem.set(u8, &buf_copy_, 'a'); break :brk buf_copy_; })); try std.testing.expectEqual(std.mem.eql(u8, &buf, &bad), eqlComptime(&bad, comptime brk: { var buf_copy_: [size]u8 = undefined; std.mem.set(u8, &buf_copy_, 'a'); break :brk buf_copy_; })); } } test "eqlComptimeUTF16" { try std.testing.expectEqual(eqlComptimeUTF16(std.unicode.utf8ToUtf16LeStringLiteral("bun-darwin-aarch64.zip"), "bun-darwin-aarch64.zip"), true); const sizes = [_]u16{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23, 22, 24 }; inline for (sizes) |size| { var buf: [size]u16 = undefined; std.mem.set(u16, &buf, @as(u8, 'a')); var buf_copy: [size]u16 = undefined; std.mem.set(u16, &buf_copy, @as(u8, 'a')); var bad: [size]u16 = undefined; std.mem.set(u16, &bad, @as(u16, 'b')); try std.testing.expectEqual(std.mem.eql(u16, &buf, &buf_copy), eqlComptimeUTF16(&buf, comptime &brk: { var buf_copy_: [size]u8 = undefined; std.mem.set(u8, &buf_copy_, @as(u8, 'a')); break :brk buf_copy_; })); try std.testing.expectEqual(std.mem.eql(u16, &buf, &bad), eqlComptimeUTF16(&bad, comptime &brk: { var buf_copy_: [size]u8 = undefined; std.mem.set(u8, &buf_copy_, @as(u8, 'a')); break :brk buf_copy_; })); } } test "copyLowercase" { { var in = "Hello, World!"; var out = std.mem.zeroes([in.len]u8); var out_ = copyLowercase(in, &out); try std.testing.expectEqualStrings(out_, "hello, world!"); } { var in = "_ListCache"; var out = std.mem.zeroes([in.len]u8); var out_ = copyLowercase(in, &out); try std.testing.expectEqualStrings(out_, "_listcache"); } } test "StringOrTinyString" { const correct: string = "helloooooooo"; const big = "wawaweewaverylargeihaveachairwawaweewaverylargeihaveachairwawaweewaverylargeihaveachairwawaweewaverylargeihaveachair"; var str = StringOrTinyString.init(correct); try std.testing.expectEqualStrings(correct, str.slice()); str = StringOrTinyString.init(big); try std.testing.expectEqualStrings(big, str.slice()); try std.testing.expect(@sizeOf(StringOrTinyString) == 32); } test "StringOrTinyString Lowercase" { const correct: string = "HELLO!!!!!"; var str = StringOrTinyString.initLowerCase(correct); try std.testing.expectEqualStrings("hello!!!!!", str.slice()); } /// startsWith except it checks for non-empty strings pub fn hasPrefix(self: string, str: string) bool { return str.len > 0 and startsWith(self, str); } pub fn startsWith(self: string, str: string) bool { if (str.len > self.len) { return false; } var i: usize = 0; while (i < str.len) { if (str[i] != self[i]) { return false; } i += 1; } return true; } pub inline fn endsWith(self: string, str: string) bool { return str.len == 0 or @call(.{ .modifier = .always_inline }, std.mem.endsWith, .{ u8, self, str }); } pub inline fn endsWithComptime(self: string, comptime str: anytype) bool { return self.len >= str.len and eqlComptimeIgnoreLen(self[self.len - str.len .. self.len], comptime str); } pub inline fn startsWithChar(self: string, char: u8) bool { return self.len > 0 and self[0] == char; } pub inline fn endsWithChar(self: string, char: u8) bool { return self.len == 0 or self[self.len - 1] == char; } pub fn withoutTrailingSlash(this: string) []const u8 { var href = this; while (href.len > 1 and href[href.len - 1] == '/') { href = href[0 .. href.len - 1]; } return href; } pub fn withoutLeadingSlash(this: string) []const u8 { return std.mem.trimLeft(u8, this, "/"); } pub fn endsWithAny(self: string, str: string) bool { const end = self[self.len - 1]; for (str) |char| { if (char == end) { return true; } } return false; } pub fn quotedAlloc(allocator: std.mem.Allocator, self: string) !string { var count: usize = 0; for (self) |char| { count += @boolToInt(char == '"'); } if (count == 0) { return allocator.dupe(u8, self); } var i: usize = 0; var out = try allocator.alloc(u8, self.len + count); for (self) |char| { if (char == '"') { out[i] = '\\'; i += 1; } out[i] = char; i += 1; } return out; } pub fn eqlAnyComptime(self: string, comptime list: []const string) bool { inline for (list) |item| { if (eqlComptimeCheckLenWithType(u8, self, item, true)) return true; } return false; } pub fn endsWithAnyComptime(self: string, comptime str: string) bool { if (comptime str.len < 10) { const last = self[self.len - 1]; inline for (str) |char| { if (char == last) { return true; } } return false; } else { return endsWithAny(self, str); } } pub fn eql(self: string, other: anytype) bool { if (self.len != other.len) return false; if (comptime @TypeOf(other) == *string) { return eql(self, other.*); } for (self) |c, i| { if (other[i] != c) return false; } return true; } pub inline fn eqlInsensitive(self: string, other: anytype) bool { return std.ascii.eqlIgnoreCase(self, other); } pub fn eqlComptime(self: string, comptime alt: anytype) bool { return eqlComptimeCheckLenWithType(u8, self, alt, true); } pub fn eqlComptimeUTF16(self: []const u16, comptime alt: []const u8) bool { return eqlComptimeCheckLenWithType(u16, self, comptime std.unicode.utf8ToUtf16LeStringLiteral(alt), true); } pub fn eqlComptimeIgnoreLen(self: string, comptime alt: anytype) bool { return eqlComptimeCheckLenWithType(u8, self, alt, false); } inline fn eqlComptimeCheckLenWithType(comptime Type: type, a: []const Type, comptime b: anytype, comptime check_len: bool) bool { @setEvalBranchQuota(9999); if (comptime check_len) { if (comptime b.len == 0) { return a.len == 0; } switch (a.len) { b.len => {}, else => return false, } } const len = comptime b.len; comptime var dword_length = b.len >> 3; const slice = comptime if (@typeInfo(@TypeOf(b)) != .Pointer) b else std.mem.span(b); const divisor = comptime @sizeOf(Type); comptime var b_ptr: usize = 0; inline while (dword_length > 0) : (dword_length -= 1) { if (@bitCast(usize, a[b_ptr..][0 .. @sizeOf(usize) / divisor].*) != comptime @bitCast(usize, (slice[b_ptr..])[0 .. @sizeOf(usize) / divisor].*)) return false; comptime b_ptr += @sizeOf(usize); if (comptime b_ptr == b.len) return true; } if (comptime @sizeOf(usize) == 8) { if (comptime (len & 4) != 0) { if (@bitCast(u32, a[b_ptr..][0 .. @sizeOf(u32) / divisor].*) != comptime @bitCast(u32, (slice[b_ptr..])[0 .. @sizeOf(u32) / divisor].*)) return false; comptime b_ptr += @sizeOf(u32); if (comptime b_ptr == b.len) return true; } } if (comptime (len & 2) != 0) { if (@bitCast(u16, a[b_ptr..][0 .. @sizeOf(u16) / divisor].*) != comptime @bitCast(u16, slice[b_ptr .. b_ptr + (@sizeOf(u16) / divisor)].*)) return false; comptime b_ptr += @sizeOf(u16); if (comptime b_ptr == b.len) return true; } if ((comptime (len & 1) != 0) and a[b_ptr] != comptime b[b_ptr]) return false; return true; } pub fn eqlCaseInsensitiveASCII(a: string, comptime b: anytype, comptime check_len: bool) bool { if (comptime check_len) { if (comptime b.len == 0) { return a.len == 0; } switch (a.len) { b.len => void{}, else => return false, } } // pray to the auto vectorization gods inline for (b) |c, i| { const char = comptime std.ascii.toLower(c); if (char != std.ascii.toLower(a[i])) return false; } return true; } pub fn eqlLong(a_: string, b: string, comptime check_len: bool) bool { if (comptime check_len) { if (a_.len == 0) { return b.len == 0; } if (a_.len != b.len) { return false; } } const len = b.len; var dword_length = b.len >> 3; var b_ptr: usize = 0; const a = a_.ptr; while (dword_length > 0) : (dword_length -= 1) { const slice = b.ptr; if (@bitCast(usize, a[b_ptr..len][0..@sizeOf(usize)].*) != @bitCast(usize, (slice[b_ptr..b.len])[0..@sizeOf(usize)].*)) return false; b_ptr += @sizeOf(usize); if (b_ptr == b.len) return true; } if (comptime @sizeOf(usize) == 8) { if ((len & 4) != 0) { const slice = b.ptr; if (@bitCast(u32, a[b_ptr..len][0..@sizeOf(u32)].*) != @bitCast(u32, (slice[b_ptr..b.len])[0..@sizeOf(u32)].*)) return false; b_ptr += @sizeOf(u32); if (b_ptr == b.len) return true; } } if ((len & 2) != 0) { if (@bitCast(u16, a[b_ptr..len][0..@sizeOf(u16)].*) != @bitCast(u16, b.ptr[b_ptr..len][0..@sizeOf(u16)].*)) return false; b_ptr += @sizeOf(u16); if (b_ptr == b.len) return true; } if (((len & 1) != 0) and a[b_ptr] != b[b_ptr]) return false; return true; } pub inline fn append(allocator: std.mem.Allocator, self: string, other: string) !string { return std.fmt.allocPrint(allocator, "{s}{s}", .{ self, other }); } pub inline fn joinBuf(out: []u8, parts: anytype, comptime parts_len: usize) []u8 { var remain = out; var count: usize = 0; comptime var i: usize = 0; inline while (i < parts_len) : (i += 1) { const part = parts[i]; std.mem.copy(u8, remain, part); remain = remain[part.len..]; count += part.len; } return out[0..count]; } pub fn index(self: string, str: string) i32 { if (std.mem.indexOf(u8, self, str)) |i| { return @intCast(i32, i); } else { return -1; } } pub fn eqlUtf16(comptime self: string, other: []const u16) bool { return std.mem.eql(u16, std.unicode.utf8ToUtf16LeStringLiteral(self), other); } pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) !string { return try toUTF8AllocWithType(allocator, []const u16, js); } pub inline fn appendUTF8MachineWordToUTF16MachineWord(output: *[4]u16, input: *const [4]u8) void { output[0] = input[0]; output[1] = input[1]; output[2] = input[2]; output[3] = input[3]; } pub inline fn copyU8IntoU16(output_: []u16, input_: []const u8) void { var output = output_; var input = input_; if (comptime Environment.allow_assert) { std.debug.assert(input.len <= output.len); } while (input.len >= 4) { appendUTF8MachineWordToUTF16MachineWord(output[0..4], input[0..4]); output = output[4..]; input = input[4..]; } for (input) |c, i| { output[i] = c; } } pub inline fn copyU16IntoU8(output_: []u8, comptime InputType: type, input_: InputType) void { var output = output_; var input = input_; if (comptime Environment.allow_assert) { std.debug.assert(input.len <= output.len); } while (input.len >= 4) { output[0] = @intCast(u8, input[0]); output[1] = @intCast(u8, input[1]); output[2] = @intCast(u8, input[2]); output[3] = @intCast(u8, input[3]); output = output[4..]; input = input[4..]; } for (input) |c, i| { output[i] = @intCast(u8, c); } } const strings = @This(); pub fn toUTF16Alloc(allocator: std.mem.Allocator, bytes: []const u8, comptime fail_if_invalid: bool) !?[]u16 { if (strings.firstNonASCII(bytes)) |i| { const ascii = bytes[0..i]; const chunk = bytes[i..]; var output = try std.ArrayList(u16).initCapacity(allocator, ascii.len + 2); errdefer output.deinit(); output.items.len = ascii.len; strings.copyU8IntoU16(output.items, ascii); var remaining = chunk; { var sequence: [4]u8 = undefined; if (remaining.len >= 4) { sequence = remaining[0..4].*; } else { sequence[0] = remaining[0]; sequence[1] = if (remaining.len > 1) remaining[1] else 0; sequence[2] = if (remaining.len > 2) remaining[2] else 0; sequence[3] = 0; } const replacement = strings.convertUTF8BytesIntoUTF16(&sequence); remaining = remaining[@maximum(replacement.len, 1)..]; const new_len = strings.u16Len(replacement.code_point); try output.ensureUnusedCapacity(new_len); output.items.len += @as(usize, new_len); if (comptime fail_if_invalid) { if (replacement.code_point == unicode_replacement) return error.InvalidByteSequence; } switch (replacement.code_point) { 0...0xffff => { output.items[output.items.len - 1] = @intCast(u16, replacement.code_point); }, else => |c| { output.items[output.items.len - 2 .. output.items.len][0..2].* = [2]u16{ strings.u16Lead(c), strings.u16Trail(c) }; }, } } while (strings.firstNonASCII(remaining)) |j| { const last = remaining[0..j]; remaining = remaining[j..]; var sequence: [4]u8 = undefined; if (remaining.len >= 4) { sequence = remaining[0..4].*; } else { sequence[0] = remaining[0]; sequence[1] = if (remaining.len > 1) remaining[1] else 0; sequence[2] = if (remaining.len > 2) remaining[2] else 0; sequence[3] = 0; } const replacement = strings.convertUTF8BytesIntoUTF16(&sequence); remaining = remaining[@maximum(replacement.len, 1)..]; const new_len = j + @as(usize, strings.u16Len(replacement.code_point)); try output.ensureUnusedCapacity(new_len); output.items.len += new_len; strings.copyU8IntoU16(output.items[output.items.len - new_len ..][0..j], last); if (comptime fail_if_invalid) { if (replacement.code_point == unicode_replacement) return error.InvalidByteSequence; } switch (replacement.code_point) { 0...0xffff => { output.items[output.items.len - 1] = @intCast(u16, replacement.code_point); }, else => |c| { output.items[output.items.len - 2 .. output.items.len][0..2].* = [2]u16{ strings.u16Lead(c), strings.u16Trail(c) }; }, } } if (remaining.len > 0) { try output.ensureUnusedCapacity(remaining.len); output.items.len += remaining.len; strings.copyU8IntoU16(output.items[output.items.len - remaining.len ..], remaining); } return output.toOwnedSlice(); } return null; } pub fn utf16Codepoint(comptime Type: type, input: Type) UTF16Replacement { const c0 = @as(u21, input[0]); if (c0 & ~@as(u21, 0x03ff) == 0xd800) { // surrogate pair if (input.len == 1) return .{ .len = 1, }; //error.DanglingSurrogateHalf; const c1 = @as(u21, input[1]); if (c1 & ~@as(u21, 0x03ff) != 0xdc00) if (input.len == 1) return .{ .len = 1, }; // return error.ExpectedSecondSurrogateHalf; return .{ .len = 2, .code_point = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)) }; } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) { // return error.UnexpectedSecondSurrogateHalf; return .{ .len = 1 }; } else { return .{ .code_point = c0, .len = 1 }; } } pub fn toUTF8AllocWithType(allocator: std.mem.Allocator, comptime Type: type, utf16: Type) ![]u8 { var list = std.ArrayList(u8).initCapacity(allocator, utf16.len) catch unreachable; var utf16_remaining = utf16; while (firstNonASCII16(Type, utf16_remaining)) |i| { const to_copy = utf16_remaining[0..i]; utf16_remaining = utf16_remaining[i..]; const replacement = utf16Codepoint(Type, utf16_remaining); utf16_remaining = utf16_remaining[replacement.len..]; const count: usize = switch (replacement.code_point) { 0...0x7F => 1, (0x7F + 1)...0x7FF => 2, (0x7FF + 1)...0xFFFF => 3, else => 4, }; try list.ensureUnusedCapacity(i + count); list.items.len += i; copyU16IntoU8( list.items[list.items.len - i ..], Type, to_copy, ); list.items.len += count; _ = encodeWTF8RuneT( list.items.ptr[list.items.len - count .. list.items.len - count + 4][0..4], u32, @as(u32, replacement.code_point), ); } try list.ensureUnusedCapacity(utf16_remaining.len); const old_len = list.items.len; list.items.len += utf16_remaining.len; copyU16IntoU8(list.items[old_len..], Type, utf16_remaining); return list.toOwnedSlice(); } pub const EncodeIntoResult = struct { read: u32 = 0, written: u32 = 0, }; pub fn allocateLatin1IntoUTF8(allocator: std.mem.Allocator, comptime Type: type, latin1_: Type) ![]u8 { var list = try std.ArrayList(u8).initCapacity(allocator, latin1_.len); var latin1 = latin1_; while (latin1.len > 0) { var read: usize = 0; var break_outer = false; outer: while (latin1.len >= 128) { try list.ensureUnusedCapacity(128); var base = list.items.ptr[list.items.len..list.items.len].ptr; comptime var count: usize = 0; inline while (count < 8) : (count += 1) { const vec: AsciiVector = latin1[(comptime count * ascii_vector_size)..][0..ascii_vector_size].*; const cmp = vec > max_16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < 16) { latin1 = latin1[(comptime count * ascii_vector_size)..]; list.items.len += (comptime count * ascii_vector_size); try list.appendSlice(latin1[0..first]); latin1 = latin1[first..]; break_outer = true; break :outer; } base[(comptime count * ascii_vector_size)..(comptime count * ascii_vector_size + ascii_vector_size)][0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec); } latin1 = latin1[(comptime count * ascii_vector_size)..]; list.items.len += (comptime count * ascii_vector_size); } if (!break_outer) { while (latin1.len >= ascii_vector_size) { const vec: AsciiVector = latin1[0..ascii_vector_size].*; const cmp = vec > max_16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < 16) { try list.appendSlice(latin1[0..first]); latin1 = latin1[first..]; break; } try list.ensureUnusedCapacity(ascii_vector_size); list.items.len += ascii_vector_size; list.items[list.items.len - ascii_vector_size ..][0..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec); latin1 = latin1[ascii_vector_size..]; } } while (read < latin1.len and latin1[read] < 0x80) : (read += 1) {} try list.appendSlice(latin1[0..read]); latin1 = latin1[read..]; if (latin1.len > 0) { try list.ensureUnusedCapacity(2); var buf = list.items.ptr[list.items.len .. list.items.len + 2][0..2]; list.items.len += 2; buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); latin1 = latin1[1..]; } } return list.toOwnedSlice(); } pub const UTF16Replacement = struct { code_point: u32 = unicode_replacement, len: u3 = 0, }; // This variation matches WebKit behavior. pub fn convertUTF8BytesIntoUTF16(sequence: *const [4]u8) UTF16Replacement { if (Environment.allow_assert) assert(sequence[0] > 127); const len = wtf8ByteSequenceLengthWithInvalid(sequence[0]); switch (len) { 2 => { if (Environment.allow_assert) assert(sequence[0] >= 0xC2); if (Environment.allow_assert) assert(sequence[0] <= 0xDF); if (sequence[1] < 0x80 or sequence[1] > 0xBF) { return .{ .len = 1 }; } return .{ .len = len, .code_point = ((@as(u32, sequence[0]) << 6) + @as(u32, sequence[1])) - 0x00003080 }; }, 3 => { if (Environment.allow_assert) assert(sequence[0] >= 0xE0); if (Environment.allow_assert) assert(sequence[0] <= 0xEF); switch (sequence[0]) { 0xE0 => { if (sequence[1] < 0xA0 or sequence[1] > 0xBF) { return .{ .len = 1 }; } }, 0xED => { if (sequence[1] < 0x80 or sequence[1] > 0x9F) { return .{ .len = 1 }; } }, else => { if (sequence[1] < 0x80 or sequence[1] > 0xBF) { return .{ .len = 1 }; } }, } if (sequence[2] < 0x80 or sequence[2] > 0xBF) { return .{ .len = 2 }; } return .{ .len = len, .code_point = ((@as(u32, sequence[0]) << 12) + (@as(u32, sequence[1]) << 6) + @as(u32, sequence[2])) - 0x000E2080, }; }, 4 => { if (Environment.allow_assert) assert(sequence[0] >= 0xF0); if (Environment.allow_assert) assert(sequence[0] <= 0xF4); switch (sequence[0]) { 0xF0 => { if (sequence[1] < 0x90 or sequence[1] > 0xBF) { return .{ .len = 1 }; } }, 0xF4 => { if (sequence[1] < 0x80 or sequence[1] > 0x8F) { return .{ .len = 1 }; } }, else => { if (sequence[1] < 0x80 or sequence[1] > 0xBF) { return .{ .len = 1 }; } }, } if (sequence[2] < 0x80 or sequence[2] > 0xBF) { return .{ .len = 2 }; } if (sequence[3] < 0x80 or sequence[3] > 0xBF) { return .{ .len = 3 }; } return .{ .len = 4, .code_point = ((@as(u32, sequence[0]) << 18) + (@as(u32, sequence[1]) << 12) + (@as(u32, sequence[2]) << 6) + @as(u32, sequence[3])) - 0x03C82080, }; }, else => unreachable, } } pub fn copyLatin1IntoUTF8(buf_: []u8, comptime Type: type, latin1_: Type) EncodeIntoResult { var buf = buf_; var latin1 = latin1_; while (buf.len > 0 and latin1.len > 0) { var read: usize = 0; var break_outer = false; outer: while (latin1.len >= 128 and buf.len >= 128) { comptime var count: usize = 0; inline while (count < 8) : (count += 1) { const vec: AsciiVector = latin1[(comptime count * ascii_vector_size)..][0..ascii_vector_size].*; const cmp = vec > max_16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < 16) { latin1 = latin1[(comptime count * ascii_vector_size)..]; buf = buf[(comptime count * ascii_vector_size)..]; break_outer = true; break :outer; } buf[(comptime count * ascii_vector_size)..][0..8].* = @bitCast([ascii_vector_size]u8, vec)[0..8].*; buf[(comptime count * ascii_vector_size)..][8..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[8..ascii_vector_size].*; } latin1 = latin1[(comptime count * ascii_vector_size)..]; buf = buf[(comptime count * ascii_vector_size)..]; } while (latin1.len > ascii_vector_size and !break_outer) { const vec: AsciiVector = latin1[0..ascii_vector_size].*; const cmp = vec > max_16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < 16) { break; } buf[0..8].* = @bitCast([ascii_vector_size]u8, vec)[0..8].*; buf[8..ascii_vector_size].* = @bitCast([ascii_vector_size]u8, vec)[8..ascii_vector_size].*; latin1 = latin1[ascii_vector_size..]; buf = buf[ascii_vector_size..]; } while (read < latin1.len and latin1[read] < 0x80) : (read += 1) {} const written = @minimum(read, buf.len); if (written == 0) break; @memcpy(buf.ptr, latin1.ptr, written); latin1 = latin1[written..]; buf = buf[written..]; if (latin1.len > 0 and buf.len >= 2) { buf[0..2].* = latin1ToCodepointBytesAssumeNotASCII(latin1[0]); latin1 = latin1[1..]; buf = buf[2..]; } } return .{ .read = @truncate(u32, buf_.len - buf.len), .written = @truncate(u32, latin1_.len - latin1.len), }; } test "copyLatin1IntoUTF8" { var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!"; var output = std.mem.zeroes([500]u8); const result = copyLatin1IntoUTF8(&output, string, input); try std.testing.expectEqual(input.len, result.read); try std.testing.expectEqual(input.len, result.written); try std.testing.expectEqualSlices(u8, input, output[0..result.written]); } pub fn latin1ToCodepointAssumeNotASCII(char: u8, comptime CodePointType: type) CodePointType { return @intCast( CodePointType, @bitCast( u16, latin1ToCodepointBytesAssumeNotASCII(char), ), ); } pub fn latin1ToCodepointBytesAssumeNotASCII(char: u32) [2]u8 { return [2]u8{ @truncate(u8, 0xc0 | char >> 6), @truncate(u8, 0x80 | (char & 0x3f)), }; } pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type) EncodeIntoResult { var remaining = buf; var utf16_remaining = utf16; var ended_on_non_ascii = false; while (firstNonASCII16(Type, utf16_remaining)) |i| { const end = @minimum(i, remaining.len); const to_copy = utf16_remaining[0..end]; copyU16IntoU8(remaining, Type, to_copy); remaining = remaining[end..]; utf16_remaining = utf16_remaining[end..]; if (@minimum(utf16_remaining.len, remaining.len) == 0) break; const replacement = utf16Codepoint(Type, utf16_remaining); const width: usize = switch (replacement.code_point) { 0...0x7F => 1, (0x7F + 1)...0x7FF => 2, (0x7FF + 1)...0xFFFF => 3, else => 4, }; if (width > remaining.len) { ended_on_non_ascii = width > 1; break; } utf16_remaining = utf16_remaining[replacement.len..]; _ = encodeWTF8RuneT(remaining.ptr[0..4], u32, @as(u32, replacement.code_point)); remaining = remaining[width..]; } if (remaining.len > 0 and !ended_on_non_ascii and utf16_remaining.len > 0) { const len = @minimum(remaining.len, utf16_remaining.len); copyU16IntoU8(remaining[0..len], Type, utf16_remaining[0..len]); utf16_remaining = utf16_remaining[len..]; remaining = remaining[len..]; } return .{ .read = @truncate(u32, utf16.len - utf16_remaining.len), .written = @truncate(u32, buf.len - remaining.len), }; } // Check utf16 string equals utf8 string without allocating extra memory pub fn utf16EqlString(text: []const u16, str: string) bool { if (text.len > str.len) { // Strings can't be equal if UTF-16 encoding is longer than UTF-8 encoding return false; } var temp = [4]u8{ 0, 0, 0, 0 }; const n = text.len; var j: usize = 0; var i: usize = 0; // TODO: is it safe to just make this u32 or u21? var r1: i32 = undefined; var k: u4 = 0; while (i < n) : (i += 1) { r1 = text[i]; if (r1 >= 0xD800 and r1 <= 0xDBFF and i + 1 < n) { const r2: i32 = text[i + 1]; if (r2 >= 0xDC00 and r2 <= 0xDFFF) { r1 = (r1 - 0xD800) << 10 | (r2 - 0xDC00) + 0x10000; i += 1; } } const width = encodeWTF8Rune(&temp, r1); if (j + width > str.len) { return false; } k = 0; while (k < width) : (k += 1) { if (temp[k] != str[j]) { return false; } j += 1; } } return j == str.len; } // This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info. pub fn encodeWTF8Rune(p: *[4]u8, r: i32) u3 { return @call( .{ .modifier = .always_inline, }, encodeWTF8RuneT, .{ p, u32, @intCast(u32, r), }, ); } pub fn encodeWTF8RuneT(p: *[4]u8, comptime R: type, r: R) u3 { switch (r) { 0...0x7F => { p[0] = @intCast(u8, r); return 1; }, (0x7F + 1)...0x7FF => { p[0] = @truncate(u8, 0xC0 | ((r >> 6))); p[1] = @truncate(u8, 0x80 | (r & 0x3F)); return 2; }, (0x7FF + 1)...0xFFFF => { p[0] = @truncate(u8, 0xE0 | ((r >> 12))); p[1] = @truncate(u8, 0x80 | ((r >> 6) & 0x3F)); p[2] = @truncate(u8, 0x80 | (r & 0x3F)); return 3; }, else => { p[0] = @truncate(u8, 0xF0 | ((r >> 18))); p[1] = @truncate(u8, 0x80 | ((r >> 12) & 0x3F)); p[2] = @truncate(u8, 0x80 | ((r >> 6) & 0x3F)); p[3] = @truncate(u8, 0x80 | (r & 0x3F)); return 4; }, } } pub inline fn wtf8ByteSequenceLength(first_byte: u8) u3 { return switch (first_byte) { 0 => 0, 1...0x80 - 1 => 1, else => if ((first_byte & 0xE0) == 0xC0) @as(u3, 2) else if ((first_byte & 0xF0) == 0xE0) @as(u3, 3) else if ((first_byte & 0xF8) == 0xF0) @as(u3, 4) else @as(u3, 1), }; } /// 0 == invalid pub inline fn wtf8ByteSequenceLengthWithInvalid(first_byte: u8) u3 { return switch (first_byte) { 0...0x80 - 1 => 1, else => if ((first_byte & 0xE0) == 0xC0) @as(u3, 2) else if ((first_byte & 0xF0) == 0xE0) @as(u3, 3) else if ((first_byte & 0xF8) == 0xF0) @as(u3, 4) else @as(u3, 0), }; } /// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint. /// Invalid codepoints are replaced with `zero` parameter /// This is a clone of esbuild's decodeWTF8Rune /// which was a clone of golang's "utf8.DecodeRune" that was modified to decode using WTF-8 instead. /// Asserts a multi-byte codepoint pub inline fn decodeWTF8RuneTMultibyte(p: *const [4]u8, len: u3, comptime T: type, comptime zero: T) T { std.debug.assert(len > 1); const s1 = p[1]; if ((s1 & 0xC0) != 0x80) return zero; if (len == 2) { const cp = @as(T, p[0] & 0x1F) << 6 | @as(T, s1 & 0x3F); if (cp < 0x80) return zero; return cp; } const s2 = p[2]; if ((s2 & 0xC0) != 0x80) return zero; if (len == 3) { const cp = (@as(T, p[0] & 0x0F) << 12) | (@as(T, s1 & 0x3F) << 6) | (@as(T, s2 & 0x3F)); if (cp < 0x800) return zero; return cp; } const s3 = p[3]; { const cp = (@as(T, p[0] & 0x07) << 18) | (@as(T, s1 & 0x3F) << 12) | (@as(T, s2 & 0x3F) << 6) | (@as(T, s3 & 0x3F)); if (cp < 0x10000 or cp > 0x10FFFF) return zero; return cp; } unreachable; } const ascii_vector_size = 16; const ascii_u16_vector_size = 8; const max_16_ascii = @splat(ascii_vector_size, @as(u8, 127)); const max_u16_ascii = @splat(ascii_u16_vector_size, @as(u16, 127)); const AsciiVector = std.meta.Vector(ascii_vector_size, u8); const AsciiU16Vector = std.meta.Vector(ascii_u16_vector_size, u16); const max_4_ascii = @splat(4, @as(u8, 127)); pub fn isAllASCII(slice: []const u8) bool { var remaining = slice; // The NEON SIMD unit is 128-bit wide and includes 16 128-bit registers that can be used as 32 64-bit registers if (comptime Environment.isAarch64 or Environment.isX64) { while (remaining.len >= 128) { comptime var count: usize = 0; inline while (count < 8) : (count += 1) { const vec: AsciiVector = remaining[(comptime count * ascii_vector_size)..][0..ascii_vector_size].*; if ((@reduce(.Or, vec > max_16_ascii))) { return false; } } remaining = remaining[comptime ascii_vector_size * count..]; } while (remaining.len >= ascii_vector_size) { const vec: AsciiVector = remaining[0..ascii_vector_size].*; if ((@reduce(.Or, vec > max_16_ascii))) { return false; } remaining = remaining[ascii_vector_size..]; } while (remaining.len >= 4) { const vec: std.meta.Vector(4, u8) = remaining[0..4].*; remaining = remaining[4..]; if (@reduce(.Or, vec > max_4_ascii)) { return false; } } } for (remaining) |char| { if (char > 127) { return false; } } return true; } //#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) pub inline fn u16Lead(supplementary: anytype) u16 { return @intCast(u16, (supplementary >> 10) + 0xd7c0); } //#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) pub inline fn u16Trail(supplementary: anytype) u16 { return @intCast(u16, (supplementary & 0x3ff) | 0xdc00); } //#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) pub inline fn u16Len(supplementary: anytype) u2 { return switch (@intCast(u32, supplementary)) { 0...0xffff => 1, else => 2, }; } pub fn firstNonASCII(slice: []const u8) ?u32 { var remaining = slice; if (comptime Environment.isAarch64 or Environment.isX64) { while (remaining.len >= 128) { comptime var count: usize = 0; inline while (count < 8) : (count += 1) { const vec: AsciiVector = remaining[(comptime count * ascii_vector_size)..][0..ascii_vector_size].*; const cmp = vec > max_16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < 16) { return @intCast(u32, (comptime count * ascii_vector_size) + @as(u32, first) + @intCast(u32, slice.len - remaining.len)); } } remaining = remaining[comptime ascii_vector_size * count..]; } while (remaining.len >= ascii_vector_size) { const vec: AsciiVector = remaining[0..ascii_vector_size].*; const cmp = vec > max_16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < 16) { return @as(u32, first) + @intCast(u32, slice.len - remaining.len); } remaining = remaining[ascii_vector_size..]; } } for (remaining) |char, i| { if (char > 127) { return @truncate(u32, i + (slice.len - remaining.len)); } } return null; } pub fn firstNonASCII16(comptime Slice: type, slice: Slice) ?u32 { var remaining = slice; if (comptime Environment.isAarch64 or Environment.isX64) { while (remaining.len >= 64) { comptime var count: usize = 0; inline while (count < 8) : (count += 1) { const vec: AsciiU16Vector = remaining[(comptime count * ascii_u16_vector_size)..][0..ascii_u16_vector_size].*; const cmp = vec > max_u16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < ascii_u16_vector_size) { return @intCast(u32, (comptime count * ascii_u16_vector_size) + @as(u32, first) + @intCast(u32, slice.len - remaining.len)); } } remaining = remaining[comptime ascii_u16_vector_size * count..]; } while (remaining.len >= ascii_u16_vector_size) { const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*; const cmp = vec > max_u16_ascii; const bitmask = @ptrCast(*const u16, &cmp).*; const first = @ctz(u16, bitmask); if (first < ascii_u16_vector_size) { return @intCast(u32, @as(u32, first) + @intCast(u32, slice.len - remaining.len)); } remaining = remaining[ascii_u16_vector_size..]; } } for (remaining) |char, i| { if (char > 127) { return @truncate(u32, i + (slice.len - remaining.len)); } } return null; } test "isAllASCII" { const yes = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123"; try std.testing.expectEqual(true, isAllASCII(yes)); const no = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123"; try std.testing.expectEqual(false, isAllASCII(no)); } test "firstNonASCII" { const yes = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123"; try std.testing.expectEqual(true, firstNonASCII(yes) == null); { const no = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123"; try std.testing.expectEqual(@as(u32, 50), firstNonASCII(no).?); } { const no = "aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd12312🙂3"; try std.testing.expectEqual(@as(u32, 366), firstNonASCII(no).?); } } test "firstNonASCII16" { @setEvalBranchQuota(99999); const yes = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(true, firstNonASCII16(@TypeOf(yes), yes) == null); { @setEvalBranchQuota(99999); const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(@as(u32, 50), firstNonASCII16(@TypeOf(no), no).?); } { @setEvalBranchQuota(99999); const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(@as(u32, 0), firstNonASCII16(@TypeOf(no), no).?); } { @setEvalBranchQuota(99999); const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("a🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(@as(u32, 1), firstNonASCII16(@TypeOf(no), no).?); } { @setEvalBranchQuota(99999); const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd12312🙂3")); try std.testing.expectEqual(@as(u32, 366), firstNonASCII16(@TypeOf(no), no).?); } } pub fn formatUTF16(slice_: []align(1) const u16, writer: anytype) !void { var slice = slice_; var chunk: [512 + 4]u8 = undefined; var chunk_i: u16 = 0; while (slice.len > 0) { if (chunk_i >= chunk.len - 5) { try writer.writeAll(chunk[0..chunk_i]); chunk_i = 0; } var cp: u32 = slice[0]; slice = slice[1..]; if (cp & ~@as(u32, 0x03ff) == 0xd800 and slice.len > 0) { cp = 0x10000 + (((cp & 0x03ff) << 10) | (slice[0] & 0x03ff)); slice = slice[1..]; } chunk_i += @as( u8, @call( .{ .modifier = .always_inline }, encodeWTF8RuneT, .{ chunk[chunk_i..][0..4], u32, cp }, ), ); } try writer.writeAll(chunk[0..chunk_i]); } test "print UTF16" { var err = std.io.getStdErr(); const utf16 = comptime std.unicode.utf8ToUtf16LeStringLiteral("❌ ✅ opkay "); try formatUTF16(utf16, err.writer()); // std.unicode.fmtUtf16le(utf16le: []const u16) } /// Convert potentially ill-formed UTF-8 or UTF-16 bytes to a Unicode Codepoint. /// - Invalid codepoints are replaced with `zero` parameter /// - Null bytes return 0 pub fn decodeWTF8RuneT(p: *const [4]u8, len: u3, comptime T: type, comptime zero: T) T { if (len == 0) return zero; if (len == 1) return p[0]; return decodeWTF8RuneTMultibyte(p, len, T, zero); } pub fn codepointSize(comptime R: type, r: R) u3 { return switch (r) { 0b0000_0000...0b0111_1111 => 1, 0b1100_0000...0b1101_1111 => 2, 0b1110_0000...0b1110_1111 => 3, 0b1111_0000...0b1111_0111 => 4, else => 0, }; } // /// Encode Type into UTF-8 bytes. // /// - Invalid unicode data becomes U+FFFD REPLACEMENT CHARACTER. // /// - // pub fn encodeUTF8RuneT(out: *[4]u8, comptime R: type, c: R) u3 { // switch (c) { // 0b0000_0000...0b0111_1111 => { // out[0] = @intCast(u8, c); // return 1; // }, // 0b1100_0000...0b1101_1111 => { // out[0] = @truncate(u8, 0b11000000 | (c >> 6)); // out[1] = @truncate(u8, 0b10000000 | c & 0b111111); // return 2; // }, // 0b1110_0000...0b1110_1111 => { // if (0xd800 <= c and c <= 0xdfff) { // // Replacement character // out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD }; // return 3; // } // out[0] = @truncate(u8, 0b11100000 | (c >> 12)); // out[1] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111); // out[2] = @truncate(u8, 0b10000000 | c & 0b111111); // return 3; // }, // 0b1111_0000...0b1111_0111 => { // out[0] = @truncate(u8, 0b11110000 | (c >> 18)); // out[1] = @truncate(u8, 0b10000000 | (c >> 12) & 0b111111); // out[2] = @truncate(u8, 0b10000000 | (c >> 6) & 0b111111); // out[3] = @truncate(u8, 0b10000000 | c & 0b111111); // return 4; // }, // else => { // // Replacement character // out[0..3].* = [_]u8{ 0xEF, 0xBF, 0xBD }; // return 3; // }, // } // } pub fn containsNonBmpCodePoint(text: string) bool { var iter = CodepointIterator.init(text); var curs = CodepointIterator.Cursor{}; while (iter.next(&curs)) { if (curs.c > 0xFFFF) { return true; } } return false; } // this is std.mem.trim except it doesn't forcibly change the slice to be const pub fn trim(slice: anytype, values_to_strip: []const u8) @TypeOf(slice) { var begin: usize = 0; var end: usize = slice.len; while (begin < end and std.mem.indexOfScalar(u8, values_to_strip, slice[begin]) != null) : (begin += 1) {} while (end > begin and std.mem.indexOfScalar(u8, values_to_strip, slice[end - 1]) != null) : (end -= 1) {} return slice[begin..end]; } pub fn containsNonBmpCodePointUTF16(_text: []const u16) bool { const n = _text.len; if (n > 0) { var i: usize = 0; var text = _text[0 .. n - 1]; while (i < n - 1) : (i += 1) { switch (text[i]) { // Check for a high surrogate 0xD800...0xDBFF => { // Check for a low surrogate switch (text[i + 1]) { 0xDC00...0xDFFF => { return true; }, else => {}, } }, else => {}, } } } return false; } pub fn join(slices: []const string, delimiter: string, allocator: std.mem.Allocator) !string { return try std.mem.join(allocator, delimiter, slices); } pub fn cmpStringsAsc(_: void, a: string, b: string) bool { return std.mem.order(u8, a, b) == .lt; } pub fn cmpStringsDesc(_: void, a: string, b: string) bool { return std.mem.order(u8, a, b) == .gt; } const sort_asc = std.sort.asc(u8); const sort_desc = std.sort.desc(u8); pub fn sortAsc(in: []string) void { std.sort.sort([]const u8, in, {}, cmpStringsAsc); } pub fn sortDesc(in: []string) void { std.sort.sort([]const u8, in, {}, cmpStringsDesc); } pub fn isASCIIHexDigit(c: u8) bool { return std.ascii.isDigit(c) or std.ascii.isXDigit(c); } pub fn toASCIIHexValue(character: u8) u8 { std.debug.assert(isASCIIHexDigit(character)); return switch (character) { 0...('A' - 1) => character - '0', else => (character - 'A' + 10) & 0xF, }; } pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 { return switch (first_byte) { 0b0000_0000...0b0111_1111 => 1, 0b1100_0000...0b1101_1111 => 2, 0b1110_0000...0b1110_1111 => 3, 0b1111_0000...0b1111_0111 => 4, else => 0, }; } pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type { return struct { const Iterator = @This(); bytes: []const u8, i: usize, next_width: usize = 0, width: u3 = 0, c: CodePointType = zeroValue, pub const Cursor = struct { i: u32 = 0, c: CodePointType = zeroValue, width: u3 = 0, }; pub fn init(str: string) Iterator { return Iterator{ .bytes = str, .i = 0, .c = zeroValue }; } pub fn initOffset(str: string, i: usize) Iterator { return Iterator{ .bytes = str, .i = i, .c = zeroValue }; } pub inline fn next(it: *const Iterator, cursor: *Cursor) bool { const pos: u32 = @as(u32, cursor.width) + cursor.i; if (pos >= it.bytes.len) { return false; } const cp_len = wtf8ByteSequenceLength(it.bytes[pos]); const error_char = comptime std.math.minInt(CodePointType); const codepoint = @as( CodePointType, switch (cp_len) { 0 => return false, 1 => it.bytes[pos], else => decodeWTF8RuneTMultibyte(it.bytes[pos..].ptr[0..4], cp_len, CodePointType, error_char), }, ); cursor.* = Cursor{ .i = pos, .c = if (error_char != codepoint) codepoint else unicode_replacement, .width = if (codepoint != error_char) cp_len else 1, }; return true; } inline fn nextCodepointSlice(it: *Iterator) []const u8 { const bytes = it.bytes; const prev = it.i; const next_ = prev + it.next_width; if (bytes.len <= next_) return ""; const cp_len = utf8ByteSequenceLength(bytes[next_]); it.next_width = cp_len; it.i = @minimum(next_, bytes.len); const slice = bytes[prev..][0..cp_len]; it.width = @intCast(u3, slice.len); return slice; } pub fn needsUTF8Decoding(slice: string) bool { var it = Iterator{ .bytes = slice, .i = 0 }; while (true) { const part = it.nextCodepointSlice(); @setRuntimeSafety(false); switch (part.len) { 0 => return false, 1 => continue, else => return true, } } } pub fn scanUntilQuotedValueOrEOF(iter: *Iterator, comptime quote: CodePointType) usize { while (iter.c > -1) { if (!switch (iter.nextCodepoint()) { quote => false, '\\' => brk: { if (iter.nextCodepoint() == quote) { continue; } break :brk true; }, else => true, }) { return iter.i + 1; } } return iter.i; } pub fn nextCodepoint(it: *Iterator) CodePointType { const slice = it.nextCodepointSlice(); it.c = switch (slice.len) { 0 => zeroValue, 1 => @intCast(CodePointType, slice[0]), 2 => @intCast(CodePointType, std.unicode.utf8Decode2(slice) catch unreachable), 3 => @intCast(CodePointType, std.unicode.utf8Decode3(slice) catch unreachable), 4 => @intCast(CodePointType, std.unicode.utf8Decode4(slice) catch unreachable), else => unreachable, }; return it.c; } /// Look ahead at the next n codepoints without advancing the iterator. /// If fewer than n codepoints are available, then return the remainder of the string. pub fn peek(it: *Iterator, n: usize) []const u8 { const original_i = it.i; defer it.i = original_i; var end_ix = original_i; var found: usize = 0; while (found < n) : (found += 1) { const next_codepoint = it.nextCodepointSlice() orelse return it.bytes[original_i..]; end_ix += next_codepoint.len; } return it.bytes[original_i..end_ix]; } }; } pub const CodepointIterator = NewCodePointIterator(CodePoint, -1); pub const UnsignedCodepointIterator = NewCodePointIterator(u32, 0); pub fn NewLengthSorter(comptime Type: type, comptime field: string) type { return struct { const LengthSorter = @This(); pub fn lessThan(_: LengthSorter, lhs: Type, rhs: Type) bool { return @field(lhs, field).len < @field(rhs, field).len; } }; } test "join" { var string_list = &[_]string{ "abc", "def", "123", "hello" }; const list = try join(string_list, "-", std.heap.page_allocator); try std.testing.expectEqualStrings("abc-def-123-hello", list); } test "sortAsc" { var string_list = [_]string{ "abc", "def", "123", "hello" }; var sorted_string_list = [_]string{ "123", "abc", "def", "hello" }; var sorted_join = try join(&sorted_string_list, "-", std.heap.page_allocator); sortAsc(&string_list); var string_join = try join(&string_list, "-", std.heap.page_allocator); try std.testing.expectEqualStrings(sorted_join, string_join); } test "sortDesc" { var string_list = [_]string{ "abc", "def", "123", "hello" }; var sorted_string_list = [_]string{ "hello", "def", "abc", "123" }; var sorted_join = try join(&sorted_string_list, "-", std.heap.page_allocator); sortDesc(&string_list); var string_join = try join(&string_list, "-", std.heap.page_allocator); try std.testing.expectEqualStrings(sorted_join, string_join); } pub usingnamespace @import("exact_size_matcher.zig"); pub const unicode_replacement = 0xFFFD; pub const unicode_replacement_str = brk: { var out: [std.unicode.utf8CodepointSequenceLength(unicode_replacement) catch unreachable]u8 = undefined; _ = std.unicode.utf8Encode(unicode_replacement, &out) catch unreachable; break :brk out; }; test "eqlCaseInsensitiveASCII" { try std.testing.expect(eqlCaseInsensitiveASCII("abc", "ABC", true)); try std.testing.expect(eqlCaseInsensitiveASCII("abc", "abc", true)); try std.testing.expect(eqlCaseInsensitiveASCII("aBcD", "aBcD", true)); try std.testing.expect(!eqlCaseInsensitiveASCII("aBcD", "NOOO", true)); try std.testing.expect(!eqlCaseInsensitiveASCII("aBcD", "LENGTH CHECK", true)); }