diff options
author | 2022-06-05 04:44:05 -0700 | |
---|---|---|
committer | 2022-06-05 04:44:05 -0700 | |
commit | ab04e82f55eb14347b08d9ec98399a1546b3b306 (patch) | |
tree | 84ee45cd0a80afe40973f607405b8b8fdb20706f | |
parent | 5aa196b361f58b4ba70d21464b4f0995164e269c (diff) | |
download | bun-jarred/escapeHTML.tar.gz bun-jarred/escapeHTML.tar.zst bun-jarred/escapeHTML.zip |
good enough for now (jarred/escapeHTML)
-rw-r--r-- | bench/snippets/escapeHTML.js | 56 | ||||
-rw-r--r-- | integration/bunjs-only-snippets/escapeHTML.test.js | 68 | ||||
-rw-r--r-- | package.json | 4 | ||||
-rw-r--r-- | src/javascript/jsc/api/bun.zig | 57 | ||||
-rw-r--r-- | src/javascript/jsc/bindings/bindings.cpp | 21 | ||||
-rw-r--r-- | src/javascript/jsc/test/jest.zig | 12 | ||||
-rw-r--r-- | src/string_immutable.zig | 569 |
7 files changed, 601 insertions, 186 deletions
diff --git a/bench/snippets/escapeHTML.js b/bench/snippets/escapeHTML.js index 63e68861d..b6330b630 100644 --- a/bench/snippets/escapeHTML.js +++ b/bench/snippets/escapeHTML.js @@ -1,27 +1,10 @@ import { group } from "mitata"; import { bench, run } from "mitata"; +import { encode as htmlEntityEncode } from "html-entities"; +import { escape as heEscape } from "he"; var bunEscapeHTML_ = globalThis.escapeHTML || Bun.escapeHTML; -var bunEscapeHTML = function (str) { - if (str.length === 1) { - switch (str.charCodeAt(0)) { - case 34: // " - return """; - case 38: // & - return "&"; - case 39: // ' - return "'"; // modified from escape-html; used to be ''' - case 60: // < - return "<"; - case 62: // > - return ">"; - default: - return str; - } - } - - return bunEscapeHTML_(str); -}; +var bunEscapeHTML = bunEscapeHTML_; const matchHtmlRegExp = /["'&<>]/; @@ -44,6 +27,11 @@ const FIXTURE = require("fs") }) .join(""); +const FIXTURE_WITH_UNICODE = require("fs").readFileSync( + import.meta.dir + "/_fixture.txt", + "utf8" +); + function reactEscapeHtml(string) { const str = "" + string; const match = matchHtmlRegExp.exec(str); @@ -90,25 +78,29 @@ function reactEscapeHtml(string) { } for (let input of [ - // " ", - // "<script>alert('xss')</script>", - // "hello world", - // "hello world<script>alert('xss')</script>", - // "<", - // ">", - // `short value`, - `nothing to escape `.repeat(99999), + "<script>alert('xss')</script>", + `long string, nothing to escape... 
`.repeat(9999), + `long utf16 string, no esc 🤔🤔🤔🤔🤔` + "tex".repeat(4000), + `smol`, + // `medium string with <script>alert('xss')</script>`, + FIXTURE, + // "[unicode]" + FIXTURE_WITH_UNICODE, ]) { group( { summary: true, - name: `"` + input.substring(0, Math.min(input.length, 32)) + `"`, + name: + `"` + + input.substring(0, Math.min(input.length, 32)) + + `"` + + ` (${input.length} chars)`, }, () => { - bench(`react's escapeHTML`, () => reactEscapeHtml(input)); - - bench(`bun's escapeHTML`, () => bunEscapeHTML(input)); + bench(`ReactDOM.escapeHTML`, () => reactEscapeHtml(input)); + bench(`html-entities.encode`, () => htmlEntityEncode(input)); + bench(`he.escape`, () => heEscape(input)); + bench(`Bun.escapeHTML`, () => bunEscapeHTML(input)); } ); } diff --git a/integration/bunjs-only-snippets/escapeHTML.test.js b/integration/bunjs-only-snippets/escapeHTML.test.js index 13ff138c9..6c709bf76 100644 --- a/integration/bunjs-only-snippets/escapeHTML.test.js +++ b/integration/bunjs-only-snippets/escapeHTML.test.js @@ -2,7 +2,19 @@ import { describe, it, expect } from "bun:test"; import { gcTick } from "./gc"; describe("escapeHTML", () => { + // The matrix of cases we need to test for: + // 1. Works with short strings + // 2. Works with long strings + // 3. Works with latin1 strings + // 4. Works with utf16 strings + // 5. Works when the text to escape is somewhere in the middle + // 6. Works when the text to escape is in the beginning + // 7. Works when the text to escape is in the end + // 8. 
Returns the same string when there's no need to escape it("works", () => { + expect(escapeHTML("absolutely nothing to do here")).toBe( + "absolutely nothing to do here" + ); expect(escapeHTML("<script>alert(1)</script>")).toBe( "<script>alert(1)</script>" ); @@ -18,16 +30,10 @@ describe("escapeHTML", () => { expect(escapeHTML("\v")).toBe("\v"); expect(escapeHTML("\b")).toBe("\b"); expect(escapeHTML("\u00A0")).toBe("\u00A0"); + expect(escapeHTML("<script>ab")).toBe("<script>ab"); + expect(escapeHTML("<script>")).toBe("<script>"); + expect(escapeHTML("<script><script>")).toBe("<script><script>"); - // The matrix of cases we need to test for: - // 1. Works with short strings - // 2. Works with long strings - // 3. Works with latin1 strings - // 4. Works with utf16 strings - // 5. Works when the text to escape is somewhere in the middle - // 6. Works when the text to escape is in the beginning - // 7. Works when the text to escape is in the end - // 8. Returns the same string when there's no need to escape expect(escapeHTML("lalala" + "<script>alert(1)</script>" + "lalala")).toBe( "lalala<script>alert(1)</script>lalala" ); @@ -39,6 +45,13 @@ describe("escapeHTML", () => { "lalala" + "<script>alert(1)</script>" ); + expect(escapeHTML("What does 😊 mean?")).toBe("What does 😊 mean?"); + const output = escapeHTML("<What does 😊"); + expect(output).toBe("<What does 😊"); + expect(escapeHTML("<div>What does 😊 mean in text?")).toBe( + "<div>What does 😊 mean in text?" 
+ ); + expect( escapeHTML( ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(900) @@ -50,5 +63,42 @@ describe("escapeHTML", () => { expect( escapeHTML(("lalala" + "<script>alert(1)</script>").repeat(900)) ).toBe(("lalala" + "<script>alert(1)</script>").repeat(900)); + + // the positions of the unicode codepoint are important + // our simd code for U16 is at 8 bytes, so we need to especially check the boundaries + expect( + escapeHTML("😊lalala" + "<script>alert(1)</script>" + "lalala") + ).toBe("😊lalala<script>alert(1)</script>lalala"); + expect(escapeHTML("<script>😊alert(1)</script>" + "lalala")).toBe( + "<script>😊alert(1)</script>lalala" + ); + expect(escapeHTML("<script>alert(1)😊</script>" + "lalala")).toBe( + "<script>alert(1)😊</script>lalala" + ); + expect(escapeHTML("<script>alert(1)</script>" + "😊lalala")).toBe( + "<script>alert(1)</script>😊lalala" + ); + expect(escapeHTML("<script>alert(1)</script>" + "lal😊ala")).toBe( + "<script>alert(1)</script>lal😊ala" + ); + expect( + escapeHTML("<script>alert(1)</script>" + "lal😊ala".repeat(10)) + ).toBe("<script>alert(1)</script>" + "lal😊ala".repeat(10)); + + for (let i = 1; i < 10; i++) + expect(escapeHTML("<script>alert(1)</script>" + "la😊".repeat(i))).toBe( + "<script>alert(1)</script>" + "la😊".repeat(i) + ); + + expect(escapeHTML("la😊" + "<script>alert(1)</script>")).toBe( + "la😊" + "<script>alert(1)</script>" + ); + expect( + escapeHTML(("lalala" + "<script>alert(1)</script>😊").repeat(1)) + ).toBe(("lalala" + "<script>alert(1)</script>😊").repeat(1)); + + expect(escapeHTML("😊".repeat(100))).toBe("😊".repeat(100)); + expect(escapeHTML("😊<".repeat(100))).toBe("😊<".repeat(100)); + expect(escapeHTML("<😊>".repeat(100))).toBe("<😊>".repeat(100)); }); }); diff --git a/package.json b/package.json index 1e7a82c3f..8e6b1d517 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "dependencies": { - "mitata": "^0.0.10", + "mitata": "^0.1.3", "peechy": "0.4.32", "react": "^17.0.2" }, @@ -9,6 +9,8 @@ 
"build-fallback": "esbuild --target=esnext --bundle src/fallback.ts --format=iife --platform=browser --minify > src/fallback.out.js" }, "devDependencies": { + "he": "^1.2.0", + "html-entities": "^2.3.3", "prettier": "^2.4.1", "typescript": "4.6.3" }, diff --git a/src/javascript/jsc/api/bun.zig b/src/javascript/jsc/api/bun.zig index 8228139de..72d61d2e9 100644 --- a/src/javascript/jsc/api/bun.zig +++ b/src/javascript/jsc/api/bun.zig @@ -1623,8 +1623,49 @@ pub export fn Bun__escapeHTML( const input_value = arguments[0]; const zig_str = input_value.getZigString(globalObject); + if (zig_str.len == 0) + return ZigString.Empty.toValue(globalObject); + if (zig_str.is16Bit()) { - return input_value; + var input_slice = zig_str.utf16SliceAligned(); + var escaped_html = strings.escapeHTMLForUTF16Input(globalObject.bunVM().allocator, input_slice) catch { + globalObject.vm().throwError(globalObject, ZigString.init("Out of memory").toValue(globalObject)); + return JSC.JSValue.jsUndefined(); + }; + + if (escaped_html.ptr == input_slice.ptr and escaped_html.len == input_slice.len) { + return input_value; + } + + if (input_slice.len == 1) { + // single character escaped strings are statically allocated + return ZigString.init(std.mem.sliceAsBytes(escaped_html)).to16BitValue(globalObject); + } + + if (comptime Environment.allow_assert) { + // assert that re-encoding the string produces the same result + std.debug.assert( + std.mem.eql( + u16, + (strings.toUTF16Alloc(bun.default_allocator, strings.toUTF8Alloc(bun.default_allocator, escaped_html) catch unreachable, false) catch unreachable).?, + escaped_html, + ), + ); + + // assert we do not allocate a new string unnecessarily + std.debug.assert( + !std.mem.eql( + u16, + input_slice, + escaped_html, + ), + ); + + // the output should always be longer than the input + std.debug.assert(escaped_html.len > input_slice.len); + } + + return ZigString.from16(escaped_html.ptr, escaped_html.len).toExternalValue(globalObject); } else { var 
input_slice = zig_str.slice(); var escaped_html = strings.escapeHTMLForLatin1Input(globalObject.bunVM().allocator, input_slice) catch { @@ -1641,6 +1682,20 @@ pub export fn Bun__escapeHTML( return ZigString.init(escaped_html).toValue(globalObject); } + if (comptime Environment.allow_assert) { + // the output should always be longer than the input + std.debug.assert(escaped_html.len > input_slice.len); + + // assert we do not allocate a new string unnecessarily + std.debug.assert( + !std.mem.eql( + u8, + input_slice, + escaped_html, + ), + ); + } + return ZigString.init(escaped_html).toExternalValue(globalObject); } } diff --git a/src/javascript/jsc/bindings/bindings.cpp b/src/javascript/jsc/bindings/bindings.cpp index 7505a8a8f..a4435f06b 100644 --- a/src/javascript/jsc/bindings/bindings.cpp +++ b/src/javascript/jsc/bindings/bindings.cpp @@ -1101,23 +1101,26 @@ static void free_global_string(void* str, void* ptr, unsigned len) JSC__JSValue ZigString__toExternalU16(const uint16_t* arg0, size_t len, JSC__JSGlobalObject* global) { - return JSC::JSValue::encode(JSC::JSValue(JSC::jsOwnedString( - global->vm(), - ExternalStringImpl::create(reinterpret_cast<const UChar*>(arg0), len, nullptr, free_global_string)))); + auto ref = String(ExternalStringImpl::create(reinterpret_cast<const UChar*>(arg0), len, nullptr, free_global_string)); + + return JSC::JSValue::encode(JSC::JSValue(JSC::jsString( + global->vm(), WTFMove(ref)))); } // This must be a globally allocated string JSC__JSValue ZigString__toExternalValue(const ZigString* arg0, JSC__JSGlobalObject* arg1) { ZigString str = *arg0; if (Zig::isTaggedUTF16Ptr(str.ptr)) { - return JSC::JSValue::encode(JSC::JSValue(JSC::jsOwnedString( + auto ref = String(ExternalStringImpl::create(reinterpret_cast<const UChar*>(Zig::untag(str.ptr)), str.len, nullptr, free_global_string)); + + return JSC::JSValue::encode(JSC::JSValue(JSC::jsString( + arg1->vm(), WTFMove(ref)))); + } else { + auto ref = 
String(ExternalStringImpl::create(Zig::untag(str.ptr), str.len, nullptr, free_global_string)); + return JSC::JSValue::encode(JSC::JSValue(JSC::jsString( arg1->vm(), - ExternalStringImpl::create(reinterpret_cast<const UChar*>(Zig::untag(str.ptr)), str.len, nullptr, free_global_string)))); + WTFMove(ref)))); } - - return JSC::JSValue::encode(JSC::JSValue(JSC::jsOwnedString( - arg1->vm(), - ExternalStringImpl::create(Zig::untag(str.ptr), str.len, nullptr, free_global_string)))); } VirtualMachine* JSC__JSGlobalObject__bunVM(JSC__JSGlobalObject* arg0) diff --git a/src/javascript/jsc/test/jest.zig b/src/javascript/jsc/test/jest.zig index a07a4bcac..db6353d21 100644 --- a/src/javascript/jsc/test/jest.zig +++ b/src/javascript/jsc/test/jest.zig @@ -367,7 +367,16 @@ pub const Expect = struct { this.scope.tests.items[this.test_id].counter.actual += 1; const left = JSValue.fromRef(arguments[0]); const right = JSValue.fromRef(this.value); + if (!left.isSameValue(right, ctx.ptr())) { + if (left.isString() and right.isString()) { + var left_slice = left.toSlice(ctx, getAllocator(ctx)); + defer left_slice.deinit(); + var right_slice = right.toSlice(ctx, getAllocator(ctx)); + defer right_slice.deinit(); + std.debug.assert(!strings.eqlLong(left_slice.slice(), right_slice.slice(), false)); + } + var lhs_formatter: JSC.ZigConsoleClient.Formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = ctx.ptr() }; var rhs_formatter: JSC.ZigConsoleClient.Formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = ctx.ptr() }; @@ -381,8 +390,10 @@ pub const Expect = struct { ctx, exception, ); + return null; } + return thisObject; } @@ -563,6 +574,7 @@ pub const ExpectPrototype = struct { .scope = DescribeScope.active, .test_id = DescribeScope.active.current_test_id, }; + expect_.value.?.value().ensureStillAlive(); return Expect.Class.make(ctx, expect_); } }; diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 849691ca2..dd5f28f8a 100644 --- a/src/string_immutable.zig +++ 
b/src/string_immutable.zig @@ -15,6 +15,10 @@ pub inline fn contains(self: string, str: string) bool { return std.mem.indexOf(u8, self, str) != null; } +pub fn toUTF16Literal(comptime str: []const u8) []const u16 { + return comptime std.unicode.utf8ToUtf16LeStringLiteral(str); +} + const OptionalUsize = std.meta.Int(.unsigned, @bitSizeOf(usize) - 1); pub fn indexOfAny(self: string, comptime str: anytype) ?OptionalUsize { for (self) |c, i| { @@ -324,7 +328,7 @@ test "eqlComptimeCheckLen" { } test "eqlComptimeUTF16" { - try std.testing.expectEqual(eqlComptimeUTF16(std.unicode.utf8ToUtf16LeStringLiteral("bun-darwin-aarch64.zip"), "bun-darwin-aarch64.zip"), true); + try std.testing.expectEqual(eqlComptimeUTF16(toUTF16Literal("bun-darwin-aarch64.zip"), "bun-darwin-aarch64.zip"), true); const sizes = [_]u16{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23, 22, 24 }; inline for (sizes) |size| { var buf: [size]u16 = undefined; @@ -542,7 +546,7 @@ pub fn eqlComptime(self: string, comptime alt: anytype) bool { } pub fn eqlComptimeUTF16(self: []const u16, comptime alt: []const u8) bool { - return eqlComptimeCheckLenWithType(u16, self, comptime std.unicode.utf8ToUtf16LeStringLiteral(alt), true); + return eqlComptimeCheckLenWithType(u16, self, comptime toUTF16Literal(alt), true); } pub fn eqlComptimeIgnoreLen(self: string, comptime alt: anytype) bool { @@ -703,7 +707,7 @@ pub fn index(self: string, str: string) i32 { } pub fn eqlUtf16(comptime self: string, other: []const u16) bool { - return std.mem.eql(u16, std.unicode.utf8ToUtf16LeStringLiteral(self), other); + return std.mem.eql(u16, toUTF16Literal(self), other); } pub fn toUTF8Alloc(allocator: std.mem.Allocator, js: []const u16) !string { @@ -1316,8 +1320,8 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize { } pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) ![]const u8 { - const Pusher = struct { - const lengths: [std.math.maxInt(u8)]u4 = 
brk: { + const Scalar = struct { + pub const lengths: [std.math.maxInt(u8)]u4 = brk: { var values: [std.math.maxInt(u8)]u4 = undefined; for (values) |_, i| { switch (i) { @@ -1365,19 +1369,26 @@ pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8 else => unreachable, }; } - pub inline fn push(comptime c: anytype, chars: []const u8, allo: std.mem.Allocator) []const u8 { + + pub inline fn push(comptime len: anytype, chars_: *const [len]u8, allo: std.mem.Allocator) []const u8 { + const chars = chars_.*; var total: usize = 0; - inline for (comptime bun.range(0, c)) |i| { - total += @as(usize, lengths[chars[i]]); + + comptime var remain_to_comp = len; + comptime var comp_i = 0; + + inline while (remain_to_comp > 0) : (remain_to_comp -= 1) { + total += lengths[chars[comp_i]]; + comp_i += 1; } - if (total == c) { - return chars; + if (total == len) { + return chars_; } var output = allo.alloc(u8, total) catch unreachable; var head = output.ptr; - inline for (comptime bun.range(0, c)) |i| { + inline for (comptime bun.range(0, len)) |i| { head += @This().append(head, chars[i]); } @@ -1417,19 +1428,38 @@ pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8 return strings.append(allocator, first, second); }, - 3 => return Pusher.push(3, latin1, allocator), - 4 => return Pusher.push(4, latin1, allocator), - 5 => return Pusher.push(5, latin1, allocator), - 6 => return Pusher.push(6, latin1, allocator), - 7 => return Pusher.push(7, latin1, allocator), - 8 => return Pusher.push(8, latin1, allocator), - 9 => return Pusher.push(9, latin1, allocator), - 10 => return Pusher.push(10, latin1, allocator), - 11 => return Pusher.push(11, latin1, allocator), - 12 => return Pusher.push(12, latin1, allocator), - 13 => return Pusher.push(13, latin1, allocator), - 14 => return Pusher.push(14, latin1, allocator), - 15 => return Pusher.push(15, latin1, allocator), + + // The simd implementation is slower for inputs less than 32 bytes. 
+ 3 => return Scalar.push(3, latin1[0..3], allocator), + 4 => return Scalar.push(4, latin1[0..4], allocator), + 5 => return Scalar.push(5, latin1[0..5], allocator), + 6 => return Scalar.push(6, latin1[0..6], allocator), + 7 => return Scalar.push(7, latin1[0..7], allocator), + 8 => return Scalar.push(8, latin1[0..8], allocator), + 9 => return Scalar.push(9, latin1[0..9], allocator), + 10 => return Scalar.push(10, latin1[0..10], allocator), + 11 => return Scalar.push(11, latin1[0..11], allocator), + 12 => return Scalar.push(12, latin1[0..12], allocator), + 13 => return Scalar.push(13, latin1[0..13], allocator), + 14 => return Scalar.push(14, latin1[0..14], allocator), + 15 => return Scalar.push(15, latin1[0..15], allocator), + 16 => return Scalar.push(16, latin1[0..16], allocator), + 17 => return Scalar.push(17, latin1[0..17], allocator), + 18 => return Scalar.push(18, latin1[0..18], allocator), + 19 => return Scalar.push(19, latin1[0..19], allocator), + 20 => return Scalar.push(20, latin1[0..20], allocator), + 21 => return Scalar.push(21, latin1[0..21], allocator), + 22 => return Scalar.push(22, latin1[0..22], allocator), + 23 => return Scalar.push(23, latin1[0..23], allocator), + 24 => return Scalar.push(24, latin1[0..24], allocator), + 25 => return Scalar.push(25, latin1[0..25], allocator), + 26 => return Scalar.push(26, latin1[0..26], allocator), + 27 => return Scalar.push(27, latin1[0..27], allocator), + 28 => return Scalar.push(28, latin1[0..28], allocator), + 29 => return Scalar.push(29, latin1[0..29], allocator), + 30 => return Scalar.push(30, latin1[0..30], allocator), + 31 => return Scalar.push(31, latin1[0..31], allocator), + 32 => return Scalar.push(32, latin1[0..32], allocator), else => { var remaining = latin1; @@ -1454,40 +1484,52 @@ pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8 std.debug.assert(!any_needs_escape); } const vec: AsciiVector = remaining[0..ascii_vector_size].*; - if (@reduce(.Min, (vec ^ vecs[0]) & - 
(vec ^ vecs[1]) & - (vec ^ vecs[2]) & - (vec ^ vecs[3]) & - (vec ^ vecs[4])) == 0) + if (@reduce(.Max, @bitCast(AsciiVectorU1, (vec == vecs[0])) | + @bitCast(AsciiVectorU1, (vec == vecs[1])) | + @bitCast(AsciiVectorU1, (vec == vecs[2])) | + @bitCast(AsciiVectorU1, (vec == vecs[3])) | + @bitCast(AsciiVectorU1, (vec == vecs[4]))) == 1) { buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr); @memcpy(buf.items.ptr, latin1.ptr, copy_len); buf.items.len = copy_len; any_needs_escape = true; - var i: usize = 0; - while (i < ascii_vector_size) : (i += 1) { + comptime var i: usize = 0; + inline while (i < ascii_vector_size) : (i += 1) { switch (vec[i]) { - '"', '&', '\'', '<', '>' => |c| { - const result = switch (c) { - '"' => """, - '&' => "&", - '\'' => "'", - '<' => "<", - '>' => ">", - else => unreachable, - }; - - buf.appendSlice(result) catch unreachable; - remaining = remaining[1..]; + '"' => { + buf.ensureUnusedCapacity((ascii_vector_size - i) + """.len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + """.len][0..""".len].* = """.*; + buf.items.len += """.len; + }, + '&' => { + buf.ensureUnusedCapacity((ascii_vector_size - i) + "&".len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + "&".len][0.."&".len].* = "&".*; + buf.items.len += "&".len; + }, + '\'' => { + buf.ensureUnusedCapacity((ascii_vector_size - i) + "'".len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + "'".len][0.."'".len].* = "'".*; + buf.items.len += "'".len; + }, + '<' => { + buf.ensureUnusedCapacity((ascii_vector_size - i) + "<".len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + "<".len][0.."<".len].* = "<".*; + buf.items.len += "<".len; + }, + '>' => { + buf.ensureUnusedCapacity((ascii_vector_size - i) + ">".len) catch unreachable; + buf.items.ptr[buf.items.len .. 
buf.items.len + ">".len][0..">".len].* = ">".*; + buf.items.len += ">".len; }, else => |c| { - buf.append(c) catch unreachable; - remaining = remaining[1..]; + buf.appendAssumeCapacity(c); }, } } + remaining = remaining[ascii_vector_size..]; break :scan_and_allocate_lazily; } @@ -1500,33 +1542,43 @@ pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8 // so we'll go ahead and copy the buffer into a new buffer while (remaining.len >= ascii_vector_size) { const vec: AsciiVector = remaining[0..ascii_vector_size].*; - if (@reduce(.Min, (vec ^ vecs[0]) & - (vec ^ vecs[1]) & - (vec ^ vecs[2]) & - (vec ^ vecs[3]) & - (vec ^ vecs[4])) == 0) + if (@reduce(.Max, @bitCast(AsciiVectorU1, (vec == vecs[0])) | + @bitCast(AsciiVectorU1, (vec == vecs[1])) | + @bitCast(AsciiVectorU1, (vec == vecs[2])) | + @bitCast(AsciiVectorU1, (vec == vecs[3])) | + @bitCast(AsciiVectorU1, (vec == vecs[4]))) == 1) { - buf.ensureUnusedCapacity(ascii_vector_size) catch unreachable; - var i: usize = 0; - while (i < ascii_vector_size) : (i += 1) { + buf.ensureUnusedCapacity(ascii_vector_size + 6) catch unreachable; + comptime var i: usize = 0; + inline while (i < ascii_vector_size) : (i += 1) { switch (vec[i]) { '"' => { - buf.appendSlice(""") catch unreachable; + buf.ensureUnusedCapacity((ascii_vector_size - i) + """.len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + """.len][0..""".len].* = """.*; + buf.items.len += """.len; }, '&' => { - buf.appendSlice("&") catch unreachable; + buf.ensureUnusedCapacity((ascii_vector_size - i) + "&".len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + "&".len][0.."&".len].* = "&".*; + buf.items.len += "&".len; }, '\'' => { - buf.appendSlice("'") catch unreachable; // modified from escape-html; used to be ''' + buf.ensureUnusedCapacity((ascii_vector_size - i) + "'".len) catch unreachable; + buf.items.ptr[buf.items.len .. 
buf.items.len + "'".len][0.."'".len].* = "'".*; + buf.items.len += "'".len; }, '<' => { - buf.appendSlice("<") catch unreachable; + buf.ensureUnusedCapacity((ascii_vector_size - i) + "<".len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + "<".len][0.."<".len].* = "<".*; + buf.items.len += "<".len; }, '>' => { - buf.appendSlice(">") catch unreachable; + buf.ensureUnusedCapacity((ascii_vector_size - i) + ">".len) catch unreachable; + buf.items.ptr[buf.items.len .. buf.items.len + ">".len][0..">".len].* = ">".*; + buf.items.len += ">".len; }, else => |c| { - buf.append(c) catch unreachable; + buf.appendAssumeCapacity(c); }, } } @@ -1542,94 +1594,343 @@ pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8 } } + var ptr = remaining.ptr; + const end = remaining.ptr + remaining.len; + if (!any_needs_escape) { - scan_and_allocate_lazily: while (remaining.len > 0) { - switch (remaining[0]) { - '"' => { - const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr); - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); - @memcpy(buf.items.ptr, latin1.ptr, copy_len); + scan_and_allocate_lazily: while (ptr != end) : (ptr += 1) { + switch (ptr[0]) { + '"', '&', '\'', '<', '>' => |c| { + buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + @as(usize, Scalar.lengths[c])); + const copy_len = @ptrToInt(ptr) - @ptrToInt(latin1.ptr); + @memcpy(buf.items.ptr, latin1.ptr, copy_len - 1); buf.items.len = copy_len; - buf.appendSlice(""") catch unreachable; - remaining = remaining[1..]; any_needs_escape = true; break :scan_and_allocate_lazily; }, - '&' => { - const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr); - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); - @memcpy(buf.items.ptr, latin1.ptr, copy_len); - buf.items.len = copy_len; - buf.appendSlice("&") catch unreachable; - remaining = remaining[1..]; - any_needs_escape = true; - break :scan_and_allocate_lazily; - }, - 
'\'' => { - const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr); - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); - @memcpy(buf.items.ptr, latin1.ptr, copy_len); - buf.items.len = copy_len; - buf.appendSlice("'") catch unreachable; // modified from escape-html; used to be ''' - remaining = remaining[1..]; - any_needs_escape = true; - break :scan_and_allocate_lazily; - }, - '<' => { - const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr); - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); - @memcpy(buf.items.ptr, latin1.ptr, copy_len); - buf.items.len = copy_len; - buf.appendSlice("<") catch unreachable; - remaining = remaining[1..]; + else => {}, + } + } + } + + while (ptr != end) : (ptr += 1) { + switch (ptr[0]) { + '"' => { + buf.appendSlice(""") catch unreachable; + }, + '&' => { + buf.appendSlice("&") catch unreachable; + }, + '\'' => { + buf.appendSlice("'") catch unreachable; // modified from escape-html; used to be ''' + }, + '<' => { + buf.appendSlice("<") catch unreachable; + }, + '>' => { + buf.appendSlice(">") catch unreachable; + }, + else => |c| { + buf.append(c) catch unreachable; + }, + } + } + + if (!any_needs_escape) { + return latin1; + } + + return buf.toOwnedSlice(); + }, + } +} + +pub fn escapeHTMLForUTF16Input(allocator: std.mem.Allocator, utf16: []const u16) ![]const u16 { + const Scalar = struct { + pub const lengths: [std.math.maxInt(u8)]u4 = brk: { + var values: [std.math.maxInt(u8)]u4 = undefined; + for (values) |_, i| { + values[i] = switch (i) { + '"' => """.len, + '&' => "&".len, + '\'' => "'".len, + '<' => "<".len, + '>' => ">".len, + else => 1, + }; + } + + break :brk values; + }; + }; + switch (utf16.len) { + 0 => return &[_]u16{}, + 1 => return switch (utf16[0]) { + '"' => toUTF16Literal("""), + '&' => toUTF16Literal("&"), + '\'' => toUTF16Literal("'"), + '<' => toUTF16Literal("<"), + '>' => toUTF16Literal(">"), + else => utf16, + }, + 2 => { + const first = 
std.mem.sliceAsBytes(switch (utf16[0]) { + '"' => toUTF16Literal("""), + '&' => toUTF16Literal("&"), + '\'' => toUTF16Literal("'"), + '<' => toUTF16Literal("<"), + '>' => toUTF16Literal(">"), + else => @as([]const u16, utf16[0..1]), + }); + const second = std.mem.sliceAsBytes(switch (utf16[1]) { + '"' => toUTF16Literal("""), + '&' => toUTF16Literal("&"), + '\'' => toUTF16Literal("'"), + '<' => toUTF16Literal("<"), + '>' => toUTF16Literal(">"), + else => @as([]const u16, utf16[1..2]), + }); + if (first.len == 1 and second.len == 1) { + return utf16; + } + const outlen = first.len + second.len; + var buf = allocator.alloc(u16, outlen / 2) catch unreachable; + var buf_ = std.mem.sliceAsBytes(buf); + @memcpy(buf_.ptr, first.ptr, first.len); + @memcpy(buf_.ptr + first.len, second.ptr, second.len); + return buf; + }, + + else => { + var remaining = utf16; + + var any_needs_escape = false; + var buf: std.ArrayList(u16) = undefined; + + if (comptime Environment.isAarch64 or Environment.isX64) { + const vec_chars = "\"&'<>"; + const vecs: [vec_chars.len]AsciiU16Vector = brk: { + var _vecs: [vec_chars.len]AsciiU16Vector = undefined; + for (vec_chars) |c, i| { + _vecs[i] = @splat(ascii_u16_vector_size, @as(u16, c)); + } + break :brk _vecs; + }; + // pass #1: scan for any characters that need escaping + // assume most strings won't need any escaping, so don't actually allocate the buffer + scan_and_allocate_lazily: while (remaining.len >= ascii_u16_vector_size) { + if (comptime Environment.allow_assert) { + std.debug.assert(!any_needs_escape); + } + const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*; + if (@reduce(.Max, @bitCast(AsciiVectorU16U1, vec > @splat(ascii_u16_vector_size, @as(u16, 127))) | + @bitCast(AsciiVectorU16U1, (vec == vecs[0])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[1])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[2])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[3])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[4]))) == 1) + { + var i: u16 
= 0; + lazy: { + while (i < ascii_u16_vector_size) { + switch (remaining[i]) { + '"', '&', '\'', '<', '>' => { + any_needs_escape = true; + break :lazy; + }, + 128...std.math.maxInt(u16) => { + const cp = utf16Codepoint([]const u16, remaining[i..]); + i += @as(u16, cp.len); + }, + else => { + i += 1; + }, + } + } + } + + if (!any_needs_escape) { + remaining = remaining[i..]; + continue :scan_and_allocate_lazily; + } + + buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + 6); + std.debug.assert(@ptrToInt(remaining.ptr + i) >= @ptrToInt(utf16.ptr)); + const to_copy = std.mem.sliceAsBytes(utf16)[0 .. @ptrToInt(remaining.ptr + i) - @ptrToInt(utf16.ptr)]; + @memcpy(@ptrCast([*]align(2) u8, buf.items.ptr), to_copy.ptr, to_copy.len); + buf.items.len = std.mem.bytesAsSlice(u16, to_copy).len; + + while (i < ascii_u16_vector_size) { + switch (remaining[i]) { + '"', '&', '\'', '<', '>' => |c| { + const result = switch (c) { + '"' => toUTF16Literal("""), + '&' => toUTF16Literal("&"), + '\'' => toUTF16Literal("'"), + '<' => toUTF16Literal("<"), + '>' => toUTF16Literal(">"), + else => unreachable, + }; + + buf.appendSlice(result) catch unreachable; + i += 1; + }, + 128...std.math.maxInt(u16) => { + const cp = utf16Codepoint([]const u16, remaining[i..]); + + buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable; + i += @as(u16, cp.len); + }, + else => |c| { + i += 1; + buf.append(c) catch unreachable; + }, + } + } + + // edgecase: code point width could exceed asdcii_u16_vector_size + remaining = remaining[i..]; + break :scan_and_allocate_lazily; + } + + remaining = remaining[ascii_u16_vector_size..]; + } + + if (any_needs_escape) { + // pass #2: we found something that needed an escape + // but there's still some more text to + // so we'll go ahead and copy the buffer into a new buffer + while (remaining.len >= ascii_u16_vector_size) { + const vec: AsciiU16Vector = remaining[0..ascii_u16_vector_size].*; + if (@reduce(.Max, 
@bitCast(AsciiVectorU16U1, vec > @splat(ascii_u16_vector_size, @as(u16, 127))) | + @bitCast(AsciiVectorU16U1, (vec == vecs[0])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[1])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[2])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[3])) | + @bitCast(AsciiVectorU16U1, (vec == vecs[4]))) == 1) + { + buf.ensureUnusedCapacity(ascii_u16_vector_size) catch unreachable; + var i: u16 = 0; + while (i < ascii_u16_vector_size) { + switch (remaining[i]) { + '"' => { + buf.appendSlice(toUTF16Literal(""")) catch unreachable; + i += 1; + }, + '&' => { + buf.appendSlice(toUTF16Literal("&")) catch unreachable; + i += 1; + }, + '\'' => { + buf.appendSlice(toUTF16Literal("'")) catch unreachable; // modified from escape-html; used to be ''' + i += 1; + }, + '<' => { + buf.appendSlice(toUTF16Literal("<")) catch unreachable; + i += 1; + }, + '>' => { + buf.appendSlice(toUTF16Literal(">")) catch unreachable; + i += 1; + }, + 128...std.math.maxInt(u16) => { + const cp = utf16Codepoint([]const u16, remaining[i..]); + + buf.appendSlice(remaining[i..][0..@as(usize, cp.len)]) catch unreachable; + i += @as(u16, cp.len); + }, + else => |c| { + buf.append(c) catch unreachable; + i += 1; + }, + } + } + + remaining = remaining[i..]; + continue; + } + + try buf.ensureUnusedCapacity(ascii_u16_vector_size); + buf.items.ptr[buf.items.len .. buf.items.len + ascii_u16_vector_size][0..ascii_u16_vector_size].* = remaining[0..ascii_u16_vector_size].*; + buf.items.len += ascii_u16_vector_size; + remaining = remaining[ascii_u16_vector_size..]; + } + } + } + + var ptr = remaining.ptr; + const end = remaining.ptr + remaining.len; + + if (!any_needs_escape) { + scan_and_allocate_lazily: while (ptr != end) { + switch (ptr[0]) { + '"', '&', '\'', '<', '>' => |c| { + buf = try std.ArrayList(u16).initCapacity(allocator, utf16.len + @as(usize, Scalar.lengths[c])); + std.debug.assert(@ptrToInt(ptr) >= @ptrToInt(utf16.ptr)); + + const to_copy = std.mem.sliceAsBytes(utf16)[0 .. 
@ptrToInt(ptr) - @ptrToInt(utf16.ptr)]; + + @memcpy( + @ptrCast([*]align(2) u8, buf.items.ptr), + to_copy.ptr, + to_copy.len, + ); + + buf.items.len = std.mem.bytesAsSlice(u16, to_copy).len; any_needs_escape = true; break :scan_and_allocate_lazily; }, - '>' => { - const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr); - buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6); - @memcpy(buf.items.ptr, latin1.ptr, copy_len); - buf.items.len = copy_len; - buf.appendSlice(">") catch unreachable; - remaining = remaining[1..]; - any_needs_escape = true; - break :scan_and_allocate_lazily; + 128...std.math.maxInt(u16) => { + const cp = utf16Codepoint([]const u16, ptr[0..2]); + + buf.appendSlice(ptr[0..@as(usize, cp.len)]) catch unreachable; + ptr += @as(u16, cp.len); }, else => { - remaining = remaining[1..]; + ptr += 1; }, } } } - if (remaining.len > 0) { - std.debug.assert(any_needs_escape); - for (remaining) |c| { - switch (c) { - '"' => { - buf.appendSlice(""") catch unreachable; - }, - '&' => { - buf.appendSlice("&") catch unreachable; - }, - '\'' => { - buf.appendSlice("'") catch unreachable; // modified from escape-html; used to be ''' - }, - '<' => { - buf.appendSlice("<") catch unreachable; - }, - '>' => { - buf.appendSlice(">") catch unreachable; - }, - else => { - buf.append(c) catch unreachable; - }, - } + while (ptr != end) { + switch (ptr[0]) { + '"' => { + buf.appendSlice(toUTF16Literal(""")) catch unreachable; + ptr += 1; + }, + '&' => { + buf.appendSlice(toUTF16Literal("&")) catch unreachable; + ptr += 1; + }, + '\'' => { + buf.appendSlice(toUTF16Literal("'")) catch unreachable; // modified from escape-html; used to be ''' + ptr += 1; + }, + '<' => { + buf.appendSlice(toUTF16Literal("<")) catch unreachable; + ptr += 1; + }, + '>' => { + buf.appendSlice(toUTF16Literal(">")) catch unreachable; + ptr += 1; + }, + 128...std.math.maxInt(u16) => { + const cp = utf16Codepoint([]const u16, ptr[0..2]); + + buf.appendSlice(ptr[0..@as(usize, 
cp.len)]) catch unreachable; + ptr += @as(u16, cp.len); + }, + + else => |c| { + buf.append(c) catch unreachable; + ptr += 1; + }, } } if (!any_needs_escape) { - return latin1; + return utf16; } return buf.toOwnedSlice(); @@ -2483,27 +2784,27 @@ test "firstNonASCII" { test "firstNonASCII16" { @setEvalBranchQuota(99999); - const yes = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); + const yes = std.mem.span(toUTF16Literal("aspdokasdpokasdpokasd 
aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(true, firstNonASCII16(@TypeOf(yes), yes) == null); { @setEvalBranchQuota(99999); - const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); 
+ const no = std.mem.span(toUTF16Literal("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdoka🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(@as(u32, 50), firstNonASCII16(@TypeOf(no), no).?); } { @setEvalBranchQuota(99999); - const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); + const no = std.mem.span(toUTF16Literal("🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(@as(u32, 0), firstNonASCII16(@TypeOf(no), no).?); } { @setEvalBranchQuota(99999); - const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("a🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); + const no = 
std.mem.span(toUTF16Literal("a🙂sdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123")); try std.testing.expectEqual(@as(u32, 1), firstNonASCII16(@TypeOf(no), no).?); } { @setEvalBranchQuota(99999); - const no = std.mem.span(std.unicode.utf8ToUtf16LeStringLiteral("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd12312🙂3")); + const no = std.mem.span(toUTF16Literal("aspdokasdpokasdpokasd aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd123123aspdokasdpokasdpokasdaspdokasdpokasdpokasdaspdokasdpokasdpokasd12312🙂3")); try std.testing.expectEqual(@as(u32, 366), firstNonASCII16(@TypeOf(no), no).?); } } @@ -2541,7 +2842,7 @@ pub fn formatUTF16(slice_: []align(1) const u16, writer: anytype) !void { test "print UTF16" { var err = std.io.getStdErr(); - const utf16 = comptime std.unicode.utf8ToUtf16LeStringLiteral("❌ ✅ opkay "); + const utf16 = comptime toUTF16Literal("❌ ✅ opkay "); try formatUTF16(utf16, err.writer()); // std.unicode.fmtUtf16le(utf16le: []const u16) } |