diff options
author | 2022-06-03 18:49:12 -0700 | |
---|---|---|
committer | 2022-06-03 18:49:12 -0700 | |
commit | 9f640ffb51dc216e78af6ea5fa0eb8bc782e446b (patch) | |
tree | 19279f2f1b0d12ec3f2df651807201a76285cfd7 | |
parent | af6859acc27265e5a0cbb3107953547c74de281b (diff) | |
download | bun-9f640ffb51dc216e78af6ea5fa0eb8bc782e446b.tar.gz bun-9f640ffb51dc216e78af6ea5fa0eb8bc782e446b.tar.zst bun-9f640ffb51dc216e78af6ea5fa0eb8bc782e446b.zip |
impl #1
-rw-r--r-- | bench/snippets/escapeHTML.js | 86 | ||||
-rw-r--r-- | integration/bunjs-only-snippets/escapeHTML.test.js | 54 | ||||
-rw-r--r-- | src/javascript/jsc/api/bun.zig | 39 | ||||
-rw-r--r-- | src/string_immutable.zig | 224 |
4 files changed, 403 insertions, 0 deletions
```diff
diff --git a/bench/snippets/escapeHTML.js b/bench/snippets/escapeHTML.js
new file mode 100644
index 000000000..61bb74c7d
--- /dev/null
+++ b/bench/snippets/escapeHTML.js
@@ -0,0 +1,86 @@
+import { group } from "mitata";
+import { bench, run } from "mitata";
+
+var bunEscapeHTML = Bun.escapeHTML;
+
+const matchHtmlRegExp = /["'&<>]/;
+
+/**
+ * Escapes special characters and HTML entities in a given html string.
+ *
+ * @param {string} string HTML string to escape for later insertion
+ * @return {string}
+ * @public
+ */
+
+function reactEscapeHtml(string) {
+  const str = "" + string;
+  const match = matchHtmlRegExp.exec(str);
+
+  if (!match) {
+    return str;
+  }
+
+  let escape;
+  let html = "";
+  let index;
+  let lastIndex = 0;
+
+  for (index = match.index; index < str.length; index++) {
+    switch (str.charCodeAt(index)) {
+      case 34: // "
+        escape = "&quot;";
+        break;
+      case 38: // &
+        escape = "&amp;";
+        break;
+      case 39: // '
+        escape = "&#x27;"; // modified from escape-html; used to be '&#39;'
+        break;
+      case 60: // <
+        escape = "&lt;";
+        break;
+      case 62: // >
+        escape = "&gt;";
+        break;
+      default:
+        continue;
+    }
+
+    if (lastIndex !== index) {
+      html += str.substring(lastIndex, index);
+    }
+
+    lastIndex = index + 1;
+    html += escape;
+  }
+
+  return lastIndex !== index ? html + str.substring(lastIndex, index) : html;
+}
+
+const long = ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(9000);
+const short = "lalala" + "<script>alert(1)</script>" + "lalala";
+const middle =
+  "lalala".repeat(2000) + "<script>alert(1)</script>" + "lalala".repeat(2000);
+const nothing = "lalala".repeat(9999);
+group(`long (${long.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(long));
+  bench("bun's escapeHTML", () => bunEscapeHTML(long));
+});
+
+group(`short (${short.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(short));
+  bench("bun's escapeHTML", () => bunEscapeHTML(short));
+});
+
+group(`middle (${middle.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(middle));
+  bench("bun's escapeHTML", () => bunEscapeHTML(middle));
+});
+
+group(`nothing (${nothing.length})`, () => {
+  bench("react's escapeHTML", () => reactEscapeHtml(nothing));
+  bench("bun's escapeHTML", () => bunEscapeHTML(nothing));
+});
+
+await run();
diff --git a/integration/bunjs-only-snippets/escapeHTML.test.js b/integration/bunjs-only-snippets/escapeHTML.test.js
new file mode 100644
index 000000000..ca0ff5a36
--- /dev/null
+++ b/integration/bunjs-only-snippets/escapeHTML.test.js
@@ -0,0 +1,54 @@
+import { describe, it, expect } from "bun:test";
+import { gcTick } from "./gc";
+
+describe("Bun.escapeHTML", () => {
+  it("works", () => {
+    expect(Bun.escapeHTML("<script>alert(1)</script>")).toBe(
+      "&lt;script&gt;alert(1)&lt;/script&gt;"
+    );
+    expect(Bun.escapeHTML("<")).toBe("&lt;");
+    expect(Bun.escapeHTML(">")).toBe("&gt;");
+    expect(Bun.escapeHTML("&")).toBe("&amp;");
+    expect(Bun.escapeHTML("'")).toBe("&#x27;");
+    expect(Bun.escapeHTML('"')).toBe("&quot;");
+    expect(Bun.escapeHTML("\n")).toBe("\n");
+    expect(Bun.escapeHTML("\r")).toBe("\r");
+    expect(Bun.escapeHTML("\t")).toBe("\t");
+    expect(Bun.escapeHTML("\f")).toBe("\f");
+    expect(Bun.escapeHTML("\v")).toBe("\v");
+    expect(Bun.escapeHTML("\b")).toBe("\b");
+    expect(Bun.escapeHTML("\u00A0")).toBe("\u00A0");
+
+    // The matrix of cases we need to test for:
+    // 1. Works with short strings
+    // 2. Works with long strings
+    // 3. Works with latin1 strings
+    // 4. Works with utf16 strings
+    // 5. Works when the text to escape is somewhere in the middle
+    // 6. Works when the text to escape is in the beginning
+    // 7. Works when the text to escape is in the end
+    // 8. Returns the same string when there's no need to escape
+    expect(
+      Bun.escapeHTML("lalala" + "<script>alert(1)</script>" + "lalala")
+    ).toBe("lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala");
+
+    expect(Bun.escapeHTML("<script>alert(1)</script>" + "lalala")).toBe(
+      "&lt;script&gt;alert(1)&lt;/script&gt;lalala"
+    );
+    expect(Bun.escapeHTML("lalala" + "<script>alert(1)</script>")).toBe(
+      "lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;"
+    );
+
+    expect(
+      Bun.escapeHTML(
+        ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(900)
+      )
+    ).toBe("lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900));
+    expect(
+      Bun.escapeHTML(("<script>alert(1)</script>" + "lalala").repeat(900))
+    ).toBe("&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900));
+    expect(
+      Bun.escapeHTML(("lalala" + "<script>alert(1)</script>").repeat(900))
+    ).toBe(("lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;").repeat(900));
+  });
+});
diff --git a/src/javascript/jsc/api/bun.zig b/src/javascript/jsc/api/bun.zig
index 8bbcccfb5..1ee9cb96d 100644
--- a/src/javascript/jsc/api/bun.zig
+++ b/src/javascript/jsc/api/bun.zig
@@ -1150,6 +1150,9 @@ pub const Class = NewClass(
         .inflateSync = .{
             .rfn = JSC.wrapWithHasContainer(JSZlib, "inflateSync", false, false, true),
         },
+        .escapeHTML = .{
+            .rfn = Bun.escapeHTML,
+        },
     },
     .{
         .main = .{
@@ -1612,6 +1615,42 @@ pub fn serve(
     unreachable;
 }
 
+pub fn escapeHTML(
+    _: void,
+    ctx: js.JSContextRef,
+    _: js.JSObjectRef,
+    _: js.JSObjectRef,
+    arguments: []const js.JSValueRef,
+    exception: js.ExceptionRef,
+) js.JSValueRef {
+    if (arguments.len < 1) {
+        return ZigString.init("").toValue(ctx).asObjectRef();
+    }
+
+    const input_value = arguments[0].?.value();
+    const zig_str = input_value.getZigString(ctx);
+    if (zig_str.is16Bit()) {
+        return input_value.asObjectRef();
+    } else {
+        var input_slice = zig_str.slice();
+        var escaped_html = strings.escapeHTMLForLatin1Input(ctx.bunVM().allocator, input_slice) catch {
+            JSC.JSError(undefined, "Out of memory", .{}, ctx, exception);
+            return null;
+        };
+
+        if (escaped_html.ptr == input_slice.ptr and escaped_html.len == input_slice.len) {
+            return input_value.asObjectRef();
+        }
+
+        if (input_slice.len == 1) {
+            // single character escaped strings are statically allocated
+            return ZigString.init(escaped_html).toValue(ctx).asObjectRef();
+        }
+
+        return ZigString.init(escaped_html).toExternalValue(ctx).asObjectRef();
+    }
+}
+
 pub fn allocUnsafe(
     _: void,
     ctx: js.JSContextRef,
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 094d63f91..367e6300d 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1316,6 +1316,230 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
     return count;
 }
 
+pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) ![]const u8 {
+    switch (latin1.len) {
+        0 => return "",
+        1 => return switch (latin1[0]) {
+            '"' => "&quot;",
+            '&' => "&amp;",
+            '\'' => "&#x27;",
+            '<' => "&lt;",
+            '>' => "&gt;",
+            else => latin1,
+        },
+        else => {
+            var remaining = latin1;
+
+            const vec_chars = "\"&'<>";
+            const vecs: [vec_chars.len]AsciiVector = comptime brk: {
+                var _vecs: [vec_chars.len]AsciiVector = undefined;
+                for (vec_chars) |c, i| {
+                    _vecs[i] = @splat(ascii_vector_size, c);
+                }
+                break :brk _vecs;
+            };
+
+            var buf: std.ArrayList(u8) = undefined;
+            var any_needs_escape = false;
+
+            if (comptime Environment.isAarch64 or Environment.isX64) {
+
+                // pass #1: scan for any characters that need escaping
+                // assume most strings won't need any escaping, so don't actually allocate the buffer
+                scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
+                    if (comptime Environment.allow_assert) {
+                        std.debug.assert(!any_needs_escape);
+                    }
+
+                    const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                    if (@reduce(
+                        .Or,
+                        @bitCast(AsciiVectorU1, (vec == vecs[0])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[1])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[2])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[3])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[4])),
+                    ) == 1) {
+                        buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                        const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                        @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                        buf.items.len = copy_len;
+                        any_needs_escape = true;
+                        comptime var i: usize = 0;
+                        inline while (i < ascii_vector_size) : (i += 1) {
+                            switch (vec[i]) {
+                                '"' => {
+                                    buf.appendSlice("&quot;") catch unreachable;
+                                },
+                                '&' => {
+                                    buf.appendSlice("&amp;") catch unreachable;
+                                },
+                                '\'' => {
+                                    buf.appendSlice("&#x27;") catch unreachable; // modified from escape-html; used to be '&#39;'
+                                },
+                                '<' => {
+                                    buf.appendSlice("&lt;") catch unreachable;
+                                },
+                                '>' => {
+                                    buf.appendSlice("&gt;") catch unreachable;
+                                },
+                                else => |c| {
+                                    buf.appendAssumeCapacity(c);
+                                },
+                            }
+                        }
+                        remaining = remaining[ascii_vector_size..];
+                        break :scan_and_allocate_lazily;
+                    }
+
+                    remaining = remaining[ascii_vector_size..];
+                }
+
+                if (any_needs_escape) {
+                    // pass #2: we found something that needed an escape
+                    // so we'll go ahead and copy the buffer into a new buffer
+                    while (remaining.len >= ascii_vector_size) {
+                        const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                        if (@reduce(
+                            .Or,
+                            @bitCast(AsciiVectorU1, (vec == vecs[0])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[1])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[2])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[3])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[4])),
+                        ) == 1) {
+                            buf.ensureUnusedCapacity(ascii_vector_size) catch unreachable;
+                            comptime var i: usize = 0;
+                            inline while (i < ascii_vector_size) : (i += 1) {
+                                switch (vec[i]) {
+                                    '"' => {
+                                        buf.appendSlice("&quot;") catch unreachable;
+                                    },
+                                    '&' => {
+                                        buf.appendSlice("&amp;") catch unreachable;
+                                    },
+                                    '\'' => {
+                                        buf.appendSlice("&#x27;") catch unreachable; // modified from escape-html; used to be '&#39;'
+                                    },
+                                    '<' => {
+                                        buf.appendSlice("&lt;") catch unreachable;
+                                    },
+                                    '>' => {
+                                        buf.appendSlice("&gt;") catch unreachable;
+                                    },
+                                    else => |c| {
+                                        buf.append(c) catch unreachable;
+                                    },
+                                }
+                            }
+
+                            remaining = remaining[ascii_vector_size..];
+                            continue;
+                        }
+
+                        try buf.ensureUnusedCapacity(ascii_vector_size);
+                        buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
+                        buf.items.len += ascii_vector_size;
+                        remaining = remaining[ascii_vector_size..];
+                    }
+                }
+            }
+
+            if (!any_needs_escape) {
+                scan_and_allocate_lazily: while (remaining.len > 0) {
+                    switch (remaining[0]) {
+                        '"' => {
+                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                            buf.items.len = copy_len;
+                            buf.appendSlice("&quot;") catch unreachable;
+                            remaining = remaining[1..];
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        '&' => {
+                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                            buf.items.len = copy_len;
+                            buf.appendSlice("&amp;") catch unreachable;
+                            remaining = remaining[1..];
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        '\'' => {
+                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                            buf.items.len = copy_len;
+                            buf.appendSlice("&#x27;") catch unreachable; // modified from escape-html; used to be '&#39;'
+                            remaining = remaining[1..];
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        '<' => {
+                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                            buf.items.len = copy_len;
+                            buf.appendSlice("&lt;") catch unreachable;
+                            remaining = remaining[1..];
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        '>' => {
+                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                            buf.items.len = copy_len;
+                            buf.appendSlice("&gt;") catch unreachable;
+                            remaining = remaining[1..];
+                            any_needs_escape = true;
+                            break :scan_and_allocate_lazily;
+                        },
+                        else => {
+                            remaining = remaining[1..];
+                        },
+                    }
+                }
+            }
+
+            if (remaining.len > 0) {
+                std.debug.assert(any_needs_escape);
+                for (remaining) |c| {
+                    switch (c) {
+                        '"' => {
+                            buf.appendSlice("&quot;") catch unreachable;
+                        },
+                        '&' => {
+                            buf.appendSlice("&amp;") catch unreachable;
+                        },
+                        '\'' => {
+                            buf.appendSlice("&#x27;") catch unreachable; // modified from escape-html; used to be '&#39;'
+                        },
+                        '<' => {
+                            buf.appendSlice("&lt;") catch unreachable;
+                        },
+                        '>' => {
+                            buf.appendSlice("&gt;") catch unreachable;
+                        },
+                        else => {
+                            buf.append(c) catch unreachable;
+                        },
+                    }
+                }
+            }
+
+            if (any_needs_escape) {
+                return buf.toOwnedSlice();
+            } else {
+                return latin1;
+            }
+        },
+    }
+}
+
 test "copyLatin1IntoUTF8" {
     var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
     var output = std.mem.zeroes([500]u8);
```