diff options
author | 2021-10-25 00:52:07 -0700 | |
---|---|---|
committer | 2021-10-25 00:52:07 -0700 | |
commit | 42c264bf7b45bdf7944d10260beeaf7c8b50a21a (patch) | |
tree | aaf099275cbafcf0b253a48dda30db2dc987ed66 | |
parent | fe6564b5332a72116f68c1c95ae7da86fe2ca668 (diff) | |
download | bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.tar.gz bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.tar.zst bun-42c264bf7b45bdf7944d10260beeaf7c8b50a21a.zip |
Fix all known string encoding bugs
-rw-r--r-- | Makefile | 5 | ||||
-rw-r--r-- | build.zig | 2 | ||||
-rw-r--r-- | integration/scripts/browser.js | 80 | ||||
-rw-r--r-- | integration/snippets/string-escapes.js | 4 | ||||
-rw-r--r-- | src/http.zig | 13 | ||||
-rw-r--r-- | src/js_ast.zig | 6 | ||||
-rw-r--r-- | src/js_lexer.zig | 41 | ||||
-rw-r--r-- | src/js_printer.zig | 183 | ||||
-rw-r--r-- | src/string_immutable.zig | 53 |
9 files changed, 265 insertions, 122 deletions
@@ -719,4 +719,7 @@ run-unit: @zig-out/bin/$(testname) -- fake -test: build-unit run-unit
\ No newline at end of file +test: build-unit run-unit + +integration-test-dev: + USE_EXISTING_PROCESS=true node integration/scripts/browser.js
\ No newline at end of file @@ -286,7 +286,7 @@ pub fn build(b: *std.build.Builder) !void { obj.setBuildMode(mode); obj.linkLibC(); obj.linkLibCpp(); - + obj.strip = false; obj.bundle_compiler_rt = true; if (target.getOsTag() == .linux) { diff --git a/integration/scripts/browser.js b/integration/scripts/browser.js index eae77291c..10a07b54e 100644 --- a/integration/scripts/browser.js +++ b/integration/scripts/browser.js @@ -6,35 +6,38 @@ const fs = require("fs"); const child_process = require("child_process"); const snippetsDir = path.resolve(__dirname, "../snippets"); const serverURL = process.env.TEST_SERVER_URL || "http://localhost:8080"; - +const USE_EXISTING_PROCESS = process.env.USE_EXISTING_PROCESS || false; const DISABLE_HMR = !!process.env.DISABLE_HMR; const bunFlags = [ `--origin=${serverURL}`, DISABLE_HMR && "--disable-hmr", ].filter(Boolean); const bunExec = process.env.BUN_BIN || "bun"; -const bunProcess = child_process.spawn(bunExec, bunFlags, { - cwd: snippetsDir, - stdio: "pipe", - env: { - ...process.env, - DISABLE_BUN_ANALYTICS: "1", - }, - - shell: false, -}); -console.log("$", bunExec, bunFlags.join(" ")); -const isDebug = bunExec.endsWith("-debug"); -bunProcess.stderr.pipe(process.stderr); -bunProcess.stdout.pipe(process.stdout); -bunProcess.once("error", (err) => { - console.error("❌ bun error", err); - process.exit(1); -}); -process.on("beforeExit", () => { - bunProcess?.kill(0); -}); +var bunProcess; +if (!USE_EXISTING_PROCESS) { + bunProcess = child_process.spawn(bunExec, bunFlags, { + cwd: snippetsDir, + stdio: "pipe", + env: { + ...process.env, + DISABLE_BUN_ANALYTICS: "1", + }, + + shell: false, + }); + console.log("$", bunExec, bunFlags.join(" ")); + bunProcess.stderr.pipe(process.stderr); + bunProcess.stdout.pipe(process.stdout); + bunProcess.once("error", (err) => { + console.error("❌ bun error", err); + process.exit(1); + }); + process.on("beforeExit", () => { + bunProcess?.kill(0); + }); +} +const isDebug = bunExec.endsWith("-debug"); function writeSnapshot(name, code) { let file = path.join(__dirname, "../snapshots", name); @@ -61,7 +64,8 @@ function writeSnapshot(name, code) { } async function main() { - const browser = await puppeteer.launch(); + const launchOptions = USE_EXISTING_PROCESS ? { devtools: true } : undefined; + const browser = await puppeteer.launch(launchOptions); const promises = []; let allTestsPassed = true; @@ -69,6 +73,13 @@ async function main() { var page; try { page = await browser.newPage(); + if (USE_EXISTING_PROCESS) { + await page.evaluate(` + globalThis.BUN_DEBUG_MODE = true; + `); + } + + var shouldClose = true; page.on("console", (obj) => console.log(`[console.${obj.type()}] ${obj.text()}`) ); @@ -89,6 +100,7 @@ async function main() { console.log(`✅ ${key}`); } catch (e) { + if (USE_EXISTING_PROCESS) shouldClose = false; allTestsPassed = false; console.log(`❌ ${key}: ${(e && e.message) || e}`); } finally { @@ -102,7 +114,7 @@ async function main() { } } - await page.close(); + if (shouldClose) await page.close(); } const tests = require("./snippets.json"); @@ -112,16 +124,18 @@ async function main() { await runPage(test); } - await browser.close(); - bunProcess.kill(0); + if (!USE_EXISTING_PROCESS || (USE_EXISTING_PROCESS && allTestsPassed)) { + bunProcess && bunProcess.kill(0); - if (!allTestsPassed) { - console.error(`❌ browser test failed`); - process.exit(1); - } else { - console.log(`✅ browser test passed`); - bunProcess.kill(0); - process.exit(0); + if (!allTestsPassed) { + console.error(`❌ browser test failed`); + process.exit(1); + } else { + console.log(`✅ browser test passed`); + bunProcess && bunProcess.kill(0); + process.exit(0); + } + await browser.close(); } } diff --git a/integration/snippets/string-escapes.js b/integration/snippets/string-escapes.js index dc5c7cff9..fd821f892 100644 --- a/integration/snippets/string-escapes.js +++ b/integration/snippets/string-escapes.js @@ -1,14 +1,18 @@ var tab = "\t"; var シ = "wow"; var f = ""; +var f = "\u2087"; var obj = { "\r\n": "\r\n", "\n": "\n", "\t": "\t", + "\f": "\f", + "\v": "\v", "\u2028": "\u2028", "\u2029": "\u2029", "😊": "😊", "😃": "😃", + "🕵🏽♂️": "🕵🏽♂️", "㋡": "㋡", "☺": "☺", シ: "シ", diff --git a/src/http.zig b/src/http.zig index b7c28d0b8..e1fb5a4d4 100644 --- a/src/http.zig +++ b/src/http.zig @@ -1762,6 +1762,17 @@ pub const RequestContext = struct { threadlocal var buffer: MutableString = undefined; threadlocal var has_loaded_buffer: bool = false; + pub fn reserveNext(rctx: *SocketPrinterInternal, count: u32) anyerror![*]u8 { + try buffer.growIfNeeded(count); + return return @ptrCast([*]u8, &buffer.list.items.ptr[buffer.list.items.len]); + } + + pub fn advanceBy(rctx: *SocketPrinterInternal, count: u32) void { + if (comptime Environment.isDebug) std.debug.assert(buffer.list.items.len + count < buffer.list.capacity); + + buffer.list.items = buffer.list.items.ptr[0 .. buffer.list.items.len + count]; + } + pub fn init(rctx: *RequestContext, _loader: Options.Loader) SocketPrinterInternal { if (!has_loaded_buffer) { buffer = MutableString.init(default_allocator, 0) catch unreachable; @@ -1841,6 +1852,8 @@ pub const RequestContext = struct { SocketPrinterInternal.writeAll, SocketPrinterInternal.getLastByte, SocketPrinterInternal.getLastLastByte, + SocketPrinterInternal.reserveNext, + SocketPrinterInternal.advanceBy, ); const loader = ctx.bundler.options.loaders.get(result.file.input.name.ext) orelse .file; diff --git a/src/js_ast.zig b/src/js_ast.zig index e21281389..e926ae3b7 100644 --- a/src/js_ast.zig +++ b/src/js_ast.zig @@ -1097,15 +1097,15 @@ pub const E = struct { } pub inline fn isUTF8(s: *const String) bool { - return s.utf8.len > 0; + return @maximum(s.utf8.len, s.value.len) == s.utf8.len; } pub inline fn isBlank(s: *const String) bool { - return std.math.max(s.utf8.len, s.value.len) == 0; + return @maximum(s.utf8.len, s.value.len) == 0; } pub inline fn isPresent(s: *const String) bool { - return std.math.max(s.utf8.len, s.value.len) > 0; + return @maximum(s.utf8.len, s.value.len) > 0; } pub fn eql(s: *const String, comptime _t: type, other: anytype) bool { diff --git a/src/js_lexer.zig b/src/js_lexer.zig index 351adbfc6..a966358b8 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -137,7 +137,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { }; } - pub fn loc(self: *LexerType) logger.Loc { + pub inline fn loc(self: *const LexerType) logger.Loc { return logger.usize2Loc(self.start); } @@ -222,7 +222,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { pub fn deinit(this: *LexerType) void {} - pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { + fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { var buf = buf_.*; defer buf_.* = buf; if (comptime is_json) lexer.is_ascii_only = false; @@ -259,34 +259,36 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { const c2 = iter.c; const width2 = iter.width; - switch (iter.c) { + switch (c2) { + // https://mathiasbynens.be/notes/javascript-escapes#single 'b' => { - buf.append(std.mem.readIntNative(u16, "\\b")) catch unreachable; + buf.append(8) catch unreachable; continue; }, 'f' => { - buf.append(std.mem.readIntNative(u16, "\\f")) catch unreachable; + buf.append(9) catch unreachable; continue; }, 'n' => { - buf.append(std.mem.readIntNative(u16, "\\n")) catch unreachable; + buf.append(10) catch unreachable; continue; }, - 'r' => { - buf.append(std.mem.readIntNative(u16, "\\r")) catch unreachable; + 'v' => { + // Vertical tab is invalid JSON + // We're going to allow it. + // if (comptime is_json) { + // lexer.end = start + iter.i - width2; + // try lexer.syntaxError(); + // } + buf.append(11) catch unreachable; continue; }, 't' => { - buf.append(std.mem.readIntNative(u16, "\\t")) catch unreachable; + buf.append(12) catch unreachable; continue; }, - 'v' => { - if (comptime is_json) { - lexer.end = start + iter.i - width2; - try lexer.syntaxError(); - } - - buf.append(std.mem.readIntNative(u16, "\\v")) catch unreachable; + 'r' => { + buf.append(13) catch unreachable; continue; }, @@ -575,7 +577,8 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { } switch (lexer.code_point) { - 'f', 't', 'r', 'n', '`', '\'', '0', '"', 0x2028, 0x2029 => { + // 0 cannot be in this list because it may be a legacy octal literal + 'v', 'f', 't', 'r', 'n', '`', '\'', '"', 0x2028, 0x2029 => { try lexer.step(); continue :stringLiteral; }, @@ -673,7 +676,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { // Reset string literal const base = if (comptime quote == 0) lexer.start else lexer.start + 1; - lexer.string_literal_slice = lexer.source.contents[base..@minimum(lexer.source.contents.len, lexer.end - string_literal_details.suffix_len)]; + lexer.string_literal_slice = lexer.source.contents[base..@minimum(lexer.source.contents.len, lexer.end - @as(usize, string_literal_details.suffix_len))]; lexer.string_literal_is_ascii = !string_literal_details.needs_slow_path; lexer.string_literal_buffer.shrinkRetainingCapacity(0); if (string_literal_details.needs_slow_path) { @@ -2278,7 +2281,7 @@ pub fn NewLexer(comptime json_options: JSONOptions) type { // them. <CR><LF> and <CR> LineTerminatorSequences are normalized to // <LF> for both TV and TRV. An explicit EscapeSequence is needed to // include a <CR> or <CR><LF> sequence. - var bytes = MutableString.initCopy(lexer.allocator, text) catch unreachable; + var bytes = MutableString.init(lexer.allocator, text.len) catch unreachable; var end: usize = 0; var i: usize = 0; var c: u8 = '0'; diff --git a/src/js_printer.zig b/src/js_printer.zig index a3c439ca3..868757b7d 100644 --- a/src/js_printer.zig +++ b/src/js_printer.zig @@ -265,6 +265,29 @@ pub fn NewPrinter( } } + fn fmt(p: *Printer, comptime str: string, args: anytype) !void { + const len = @call( + .{ + .modifier = .always_inline, + }, + std.fmt.count, + .{ str, args }, + ); + var ptr = try p.writer.reserveNext( + len, + ); + + const written = @call( + .{ + .modifier = .always_inline, + }, + std.fmt.bufPrint, + .{ ptr[0..len], str, args }, + ) catch unreachable; + + p.writer.advance(written.len); + } + pub fn print(p: *Printer, str: anytype) void { switch (@TypeOf(str)) { comptime_int, u16, u8 => { @@ -538,12 +561,6 @@ pub fn NewPrinter( backtick_cost += 1; } }, - '\r', '\n' => { - if (comptime isDebug) { - std.debug.assert(allow_backtick); - } - return '`'; - }, else => {}, } i += 1; @@ -578,22 +595,22 @@ pub fn NewPrinter( } pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void { - // utf-8 is a max of 4 bytes - // we leave two extra chars for "\" and "u" - var temp = [6]u8{ 0, 0, 0, 0, 0, 0 }; var i: usize = 0; const n: usize = text.len; // e(text.len) catch unreachable; while (i < n) { - const c = @as(u21, text[i]); + const CodeUnitType = u21; + + const c = @as(CodeUnitType, text[i]); i += 1; - var r: u21 = 0; + var r: CodeUnitType = 0; var width: u3 = 0; // TODO: here switch (c) { + // Special-case the null character since it may mess with code written in C // that treats null characters as the end of the string. 0x00 => { @@ -605,6 +622,38 @@ pub fn NewPrinter( } }, + 'a'...'z', + 'A'...'Z', + '0'...'9', + '_', + '-', + '(', + '[', + '{', + '<', + '>', + ')', + ']', + '}', + ',', + ':', + ';', + '.', + '?', + '!', + '@', + '#', + '%', + '^', + '&', + '*', + '+', + '=', + ' ', + => { + e.print(@intCast(u8, c)); + }, + // Special-case the bell character since it may cause dumping this file to // the terminal to make a sound, which is undesirable. Note that we can't // use an octal literal to print this shorter since octal literals are not @@ -620,7 +669,7 @@ pub fn NewPrinter( }, '\n' => { if (quote == '`') { - e.print("\n"); + e.print('\n'); } else { e.print("\\n"); } @@ -633,32 +682,34 @@ pub fn NewPrinter( e.print("\\v"); }, // "\\" - 92 => { + '\\' => { e.print("\\\\"); }, + '\'' => { if (quote == '\'') { - e.print("\\"); + e.print('\\'); } e.print("'"); }, + '"' => { if (quote == '"') { - e.print("\\"); + e.print('\\'); } e.print("\""); }, '`' => { if (quote == '`') { - e.print("\\"); + e.print('\\'); } e.print("`"); }, '$' => { if (quote == '`' and i < n and text[i] == '{') { - e.print("\\"); + e.print('\\'); } e.print('$'); @@ -672,19 +723,16 @@ pub fn NewPrinter( 0xFEFF => { e.print("\\uFEFF"); }, + else => { switch (c) { - // Common case: just append a single byte - // we know it's not 0 since we already checked - 1...last_ascii => { - e.print(@intCast(u8, c)); - }, + first_high_surrogate...last_high_surrogate => { // Is there a next character? if (i < n) { - const c2 = text[i]; + const c2: CodeUnitType = @as(CodeUnitType, text[i]); if (c2 >= first_high_surrogate and c2 <= last_low_surrogate) { // this is some magic to me @@ -692,48 +740,62 @@ pub fn NewPrinter( i += 1; // Escape this character if UTF-8 isn't allowed if (ascii_only) { - // this is more magic!! - const bytes = [_]u8{ + var ptr = e.writer.reserve(12) catch unreachable; + ptr[0..12].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15], '\\', 'u', hex_chars[c2 >> 12], hex_chars[(c2 >> 8) & 15], hex_chars[(c2 >> 4) & 15], hex_chars[c2 & 15], }; - e.print(&bytes); + e.writer.advance(12); continue; // Otherwise, encode to UTF-8 } else { - width = std.unicode.utf8Encode(r, &temp) catch unreachable; - e.print(temp[0..width]); + var ptr = e.writer.reserve(4) catch unreachable; + e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, r)); continue; } } } - // Write an unpaired high surrogate - temp = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; - e.print(&temp); + { + // Write an unpaired high surrogate + var ptr = e.writer.reserve(6) catch unreachable; + ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; + e.writer.advance(6); + } }, // Is this an unpaired low surrogate or four-digit hex escape? first_low_surrogate...last_low_surrogate => { // Write an unpaired high surrogate - temp = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; - e.print(&temp); + var ptr = e.writer.reserve(6) catch unreachable; + ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; + e.writer.advance(6); }, else => { // this extra branch should get compiled if (ascii_only) { if (c > 0xFF) { + var ptr = e.writer.reserve(6) catch unreachable; // Write an unpaired high surrogate - temp = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; - e.print(&temp); + ptr[0..6].* = [_]u8{ '\\', 'u', hex_chars[c >> 12], hex_chars[(c >> 8) & 15], hex_chars[(c >> 4) & 15], hex_chars[c & 15] }; + e.writer.advance(6); } else { // Can this be a two-digit hex escape? - const quad = [_]u8{ '\\', 'x', hex_chars[c >> 4], hex_chars[c & 15] }; - e.print(&quad); + var ptr = e.writer.reserve(4) catch unreachable; + ptr[0..4].* = [_]u8{ '\\', 'x', hex_chars[c >> 4], hex_chars[c & 15] }; + e.writer.advance(4); } } else { - width = std.unicode.utf8Encode(c, &temp) catch unreachable; - e.print(temp[0..width]); + // chars < 255 as two digit hex escape + if (c < 0xFF) { + var ptr = e.writer.reserve(4) catch unreachable; + ptr[0..4].* = [_]u8{ '\\', 'x', hex_chars[c >> 4], hex_chars[c & 15] }; + e.writer.advance(4); + continue; + } + + var ptr = e.writer.reserve(4) catch return; + e.writer.advance(strings.encodeWTF8RuneT(ptr[0..4], CodeUnitType, c)); } }, } @@ -3849,6 +3911,8 @@ pub fn NewWriter( writeAllFn: fn (ctx: *ContextType, buf: anytype) anyerror!usize, getLastByte: fn (ctx: *const ContextType) u8, getLastLastByte: fn (ctx: *const ContextType) u8, + reserveNext: fn (ctx: *ContextType, count: u32) anyerror![*]u8, + advanceBy: fn (ctx: *ContextType, count: u32) void, ) type { return struct { const Self = @This(); @@ -3900,6 +3964,15 @@ pub fn NewWriter( return @call(.{ .modifier = .always_inline }, getLastLastByte, .{&writer.ctx}); } + pub fn reserve(writer: *Self, count: u32) anyerror![*]u8 { + return try reserveNext(&writer.ctx, count); + } + + pub fn advance(writer: *Self, count: u32) void { + advanceBy(&writer.ctx, count); + writer.written += @intCast(i32, count); + } + pub const Error = error{FormatError}; pub fn writeAll(writer: *Self, bytes: anytype) Error!usize { @@ -4015,6 +4088,16 @@ const FileWriterInternal = struct { return if (buffer.list.items.len > 1) buffer.list.items[buffer.list.items.len - 2] else 0; } + pub fn reserveNext(ctx: *FileWriterInternal, count: u32) anyerror![*]u8 { + try buffer.growIfNeeded(count); + return @ptrCast([*]u8, &buffer.list.items.ptr[buffer.list.items.len]); + } + pub fn advanceBy(ctx: *FileWriterInternal, count: u32) void { + if (comptime Environment.isDebug) std.debug.assert(buffer.list.items.len + count < buffer.list.capacity); + + buffer.list.items = buffer.list.items.ptr[0 .. buffer.list.items.len + count]; + } + pub fn done( ctx: *FileWriterInternal, ) anyerror!void { @@ -4101,6 +4184,16 @@ pub const BufferWriter = struct { return if (ctx.buffer.list.items.len > 1) ctx.buffer.list.items[ctx.buffer.list.items.len - 2] else 0; } + pub fn reserveNext(ctx: *BufferWriter, count: u32) anyerror![*]u8 { + try ctx.buffer.growIfNeeded(count); + return @ptrCast([*]u8, &ctx.buffer.list.items.ptr[ctx.buffer.list.items.len]); + } + pub fn advanceBy(ctx: *BufferWriter, count: u32) void { + if (comptime Environment.isDebug) std.debug.assert(ctx.buffer.list.items.len + count < ctx.buffer.list.capacity); + + ctx.buffer.list.items = ctx.buffer.list.items.ptr[0 .. ctx.buffer.list.items.len + count]; + } + pub fn reset(ctx: *BufferWriter) void { ctx.buffer.reset(); ctx.approximate_newline_count = 0; @@ -4127,8 +4220,18 @@ pub const BufferPrinter = NewWriter( BufferWriter.writeAll, BufferWriter.getLastByte, BufferWriter.getLastLastByte, + BufferWriter.reserveNext, + BufferWriter.advanceBy, +); +pub const FileWriter = NewWriter( + FileWriterInternal, + FileWriterInternal.writeByte, + FileWriterInternal.writeAll, + FileWriterInternal.getLastByte, + FileWriterInternal.getLastLastByte, + FileWriterInternal.reserveNext, + FileWriterInternal.advanceBy, ); -pub const FileWriter = NewWriter(FileWriterInternal, FileWriterInternal.writeByte, FileWriterInternal.writeAll, FileWriterInternal.getLastByte, FileWriterInternal.getLastLastByte); pub fn NewFileWriter(file: std.fs.File) FileWriter { var internal = FileWriterInternal.init(file); return FileWriter.init(internal); diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 0030b8708..9bfd8df77 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -549,39 +549,52 @@ pub fn utf16EqlString(text: []const u16, str: string) bool { // This is a clone of golang's "utf8.EncodeRune" that has been modified to encode using // WTF-8 instead. See https://simonsapin.github.io/wtf-8/ for more info. pub fn encodeWTF8Rune(p: []u8, r: i32) u3 { - // Negative values are erroneous. Making it unsigned addresses the problem. - const i = @intCast(u32, r); - switch (i) { + return @call( + .{ + .modifier = .always_inline, + }, + encodeWTF8RuneT, + .{ + p, + u32, + @intCast(u32, r), + }, + ); +} + +pub fn encodeWTF8RuneT(p: []u8, comptime R: type, r: R) u3 { + switch (r) { 0...0x7F => { p[0] = @intCast(u8, r); return 1; }, (0x7F + 1)...0x7FF => { - p[0] = 0xC0 | @intCast(u8, r >> 6); - p[1] = 0x80 | @intCast(u8, r) & 0x3F; + p[0] = @truncate(u8, 0xC0 | ((r >> 6))); + p[1] = @truncate(u8, 0x80 | (r & 0x3F)); return 2; }, (0x7FF + 1)...0xFFFF => { - p[0] = 0xE0 | @intCast(u8, r >> 12); - p[1] = 0x80 | @intCast(u8, r >> 6) & 0x3F; - p[2] = 0x80 | @intCast(u8, r) & 0x3F; + p[0] = @truncate(u8, 0xE0 | ((r >> 12))); + p[1] = @truncate(u8, 0x80 | ((r >> 6) & 0x3F)); + p[2] = @truncate(u8, 0x80 | (r & 0x3F)); return 3; }, else => { - p[0] = 0xF0 | @intCast(u8, r >> 18); - p[1] = 0x80 | @intCast(u8, r >> 12) & 0x3F; - p[2] = 0x80 | @intCast(u8, r >> 6) & 0x3F; - p[3] = 0x80 | @intCast(u8, r) & 0x3F; + p[0] = @truncate(u8, 0xF0 | ((r >> 18))); + p[1] = @truncate(u8, 0x80 | ((r >> 12) & 0x3F)); + p[2] = @truncate(u8, 0x80 | ((r >> 6) & 0x3F)); + p[3] = @truncate(u8, 0x80 | (r & 0x3F)); return 4; }, } } pub fn containsNonBmpCodePoint(text: string) bool { - var iter = std.unicode.Utf8Iterator{ .bytes = text, .i = 0 }; + var iter = CodepointIterator.init(text); + var curs = CodepointIterator.Cursor{}; - while (iter.nextCodepoint()) |codepoint| { - if (codepoint > 0xFFFF) { + while (iter.next(&curs)) { + if (curs.c > 0xFFFF) { return true; } } @@ -668,16 +681,6 @@ pub inline fn utf8ByteSequenceLength(first_byte: u8) u3 { }; } -pub inline fn utf8ByteSequenceLength32(first_byte: u8) u32 { - return switch (first_byte) { - 0b0000_0000...0b0111_1111 => 1, - 0b1100_0000...0b1101_1111 => 2, - 0b1110_0000...0b1110_1111 => 3, - 0b1111_0000...0b1111_0111 => 4, - else => 0, - }; -} - pub fn NewCodePointIterator(comptime CodePointType: type, comptime zeroValue: comptime_int) type { return struct { const Iterator = @This(); |