aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGravatar Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com> 2022-06-03 18:49:12 -0700
committerGravatar Jarred Sumner <709451+Jarred-Sumner@users.noreply.github.com> 2022-06-03 18:49:12 -0700
commit9f640ffb51dc216e78af6ea5fa0eb8bc782e446b (patch)
tree19279f2f1b0d12ec3f2df651807201a76285cfd7
parentaf6859acc27265e5a0cbb3107953547c74de281b (diff)
downloadbun-9f640ffb51dc216e78af6ea5fa0eb8bc782e446b.tar.gz
bun-9f640ffb51dc216e78af6ea5fa0eb8bc782e446b.tar.zst
bun-9f640ffb51dc216e78af6ea5fa0eb8bc782e446b.zip
impl #1
-rw-r--r--bench/snippets/escapeHTML.js86
-rw-r--r--integration/bunjs-only-snippets/escapeHTML.test.js54
-rw-r--r--src/javascript/jsc/api/bun.zig39
-rw-r--r--src/string_immutable.zig224
4 files changed, 403 insertions, 0 deletions
diff --git a/bench/snippets/escapeHTML.js b/bench/snippets/escapeHTML.js
new file mode 100644
index 000000000..61bb74c7d
--- /dev/null
+++ b/bench/snippets/escapeHTML.js
@@ -0,0 +1,86 @@
+import { group } from "mitata";
+import { bench, run } from "mitata";
+
+// Local alias for the built-in `Bun.escapeHTML`; the benchmark callbacks call it through this binding.
+var bunEscapeHTML = Bun.escapeHTML;
+
+const matchHtmlRegExp = /["'&<>]/;
+
+/**
+ * Escapes special characters and HTML entities in a given html string.
+ *
+ * @param {string} string HTML string to escape for later insertion
+ * @return {string}
+ * @public
+ */
+
+function reactEscapeHtml(string) {
+ const str = "" + string;
+ const match = matchHtmlRegExp.exec(str);
+
+ if (!match) {
+ return str;
+ }
+
+ let escape;
+ let html = "";
+ let index;
+ let lastIndex = 0;
+
+ for (index = match.index; index < str.length; index++) {
+ switch (str.charCodeAt(index)) {
+ case 34: // "
+ escape = "&quot;";
+ break;
+ case 38: // &
+ escape = "&amp;";
+ break;
+ case 39: // '
+ escape = "&#x27;"; // modified from escape-html; used to be '&#39'
+ break;
+ case 60: // <
+ escape = "&lt;";
+ break;
+ case 62: // >
+ escape = "&gt;";
+ break;
+ default:
+ continue;
+ }
+
+ if (lastIndex !== index) {
+ html += str.substring(lastIndex, index);
+ }
+
+ lastIndex = index + 1;
+ html += escape;
+ }
+
+ return lastIndex !== index ? html + str.substring(lastIndex, index) : html;
+}
+
+const long = ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(9000);
+const short = "lalala" + "<script>alert(1)</script>" + "lalala";
+const middle =
+ "lalala".repeat(2000) + "<script>alert(1)</script>" + "lalala".repeat(2000);
+const nothing = "lalala".repeat(9999);
+group(`long (${long.length})`, () => {
+ bench("react's escapeHTML", () => reactEscapeHtml(long));
+ bench("bun's escapeHTML", () => bunEscapeHTML(long));
+});
+
+group(`short (${short.length})`, () => {
+ bench("react's escapeHTML", () => reactEscapeHtml(short));
+ bench("bun's escapeHTML", () => bunEscapeHTML(short));
+});
+
+group(`middle (${middle.length})`, () => {
+ bench("react's escapeHTML", () => reactEscapeHtml(middle));
+ bench("bun's escapeHTML", () => bunEscapeHTML(middle));
+});
+
+group(`nothing (${nothing.length})`, () => {
+ bench("react's escapeHTML", () => reactEscapeHtml(nothing));
+ bench("bun's escapeHTML", () => bunEscapeHTML(nothing));
+});
+
+await run();
diff --git a/integration/bunjs-only-snippets/escapeHTML.test.js b/integration/bunjs-only-snippets/escapeHTML.test.js
new file mode 100644
index 000000000..ca0ff5a36
--- /dev/null
+++ b/integration/bunjs-only-snippets/escapeHTML.test.js
@@ -0,0 +1,54 @@
+import { describe, it, expect } from "bun:test";
+import { gcTick } from "./gc";
+
+describe("Bun.escapeHTML", () => {
+ it("works", () => {
+ expect(Bun.escapeHTML("<script>alert(1)</script>")).toBe(
+ "&lt;script&gt;alert(1)&lt;/script&gt;"
+ );
+ expect(Bun.escapeHTML("<")).toBe("&lt;");
+ expect(Bun.escapeHTML(">")).toBe("&gt;");
+ expect(Bun.escapeHTML("&")).toBe("&amp;");
+ expect(Bun.escapeHTML("'")).toBe("&#x27;");
+ expect(Bun.escapeHTML('"')).toBe("&quot;");
+ expect(Bun.escapeHTML("\n")).toBe("\n");
+ expect(Bun.escapeHTML("\r")).toBe("\r");
+ expect(Bun.escapeHTML("\t")).toBe("\t");
+ expect(Bun.escapeHTML("\f")).toBe("\f");
+ expect(Bun.escapeHTML("\v")).toBe("\v");
+ expect(Bun.escapeHTML("\b")).toBe("\b");
+ expect(Bun.escapeHTML("\u00A0")).toBe("\u00A0");
+
+ // The matrix of cases we need to test for:
+ // 1. Works with short strings
+ // 2. Works with long strings
+ // 3. Works with latin1 strings
+ // 4. Works with utf16 strings
+ // 5. Works when the text to escape is somewhere in the middle
+ // 6. Works when the text to escape is in the beginning
+ // 7. Works when the text to escape is in the end
+ // 8. Returns the same string when there's no need to escape
+ expect(
+ Bun.escapeHTML("lalala" + "<script>alert(1)</script>" + "lalala")
+ ).toBe("lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala");
+
+ expect(Bun.escapeHTML("<script>alert(1)</script>" + "lalala")).toBe(
+ "&lt;script&gt;alert(1)&lt;/script&gt;lalala"
+ );
+ expect(Bun.escapeHTML("lalala" + "<script>alert(1)</script>")).toBe(
+ "lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;"
+ );
+
+ expect(
+ Bun.escapeHTML(
+ ("lalala" + "<script>alert(1)</script>" + "lalala").repeat(900)
+ )
+ ).toBe("lalala&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900));
+ expect(
+ Bun.escapeHTML(("<script>alert(1)</script>" + "lalala").repeat(900))
+ ).toBe("&lt;script&gt;alert(1)&lt;/script&gt;lalala".repeat(900));
+ expect(
+ Bun.escapeHTML(("lalala" + "<script>alert(1)</script>").repeat(900))
+ ).toBe(("lalala" + "&lt;script&gt;alert(1)&lt;/script&gt;").repeat(900));
+ });
+});
diff --git a/src/javascript/jsc/api/bun.zig b/src/javascript/jsc/api/bun.zig
index 8bbcccfb5..1ee9cb96d 100644
--- a/src/javascript/jsc/api/bun.zig
+++ b/src/javascript/jsc/api/bun.zig
@@ -1150,6 +1150,9 @@ pub const Class = NewClass(
.inflateSync = .{
.rfn = JSC.wrapWithHasContainer(JSZlib, "inflateSync", false, false, true),
},
+ .escapeHTML = .{
+ .rfn = Bun.escapeHTML,
+ },
},
.{
.main = .{
@@ -1612,6 +1615,42 @@ pub fn serve(
unreachable;
}
+/// Host function behind `Bun.escapeHTML(input)`.
+///
+/// Returns a JS string in which `"`, `&`, `'`, `<` and `>` from `input` are replaced by
+/// HTML entities (see `strings.escapeHTMLForLatin1Input`).
+///
+/// - Called without arguments: returns an empty string.
+/// - Empty 8-bit input: returns an empty string without calling the escaper.
+/// - Nothing to escape: returns the original JS value, so no copy is made.
+/// - Allocation failure: reports "Out of memory" through `exception` and returns null.
+///
+/// TODO: 16-bit (UTF-16) strings are currently returned unescaped.
+pub fn escapeHTML(
+    _: void,
+    ctx: js.JSContextRef,
+    _: js.JSObjectRef,
+    _: js.JSObjectRef,
+    arguments: []const js.JSValueRef,
+    exception: js.ExceptionRef,
+) js.JSValueRef {
+    if (arguments.len < 1) {
+        return ZigString.init("").toValue(ctx).asObjectRef();
+    }
+
+    const input_value = arguments[0].?.value();
+    const zig_str = input_value.getZigString(ctx);
+    if (zig_str.is16Bit()) {
+        // TODO: there is no UTF-16 escaper yet, so the input is handed back unchanged.
+        // Callers must not rely on the result being escaped for non-Latin-1 strings.
+        return input_value.asObjectRef();
+    } else {
+        const input_slice = zig_str.slice();
+
+        // For an empty input the escaper returns the string literal "", which is neither
+        // pointer-equal to the input nor covered by the single-character case below. It
+        // would otherwise fall through to `toExternalValue` even though it was never
+        // allocated, so handle it up front.
+        if (input_slice.len == 0) {
+            return ZigString.init("").toValue(ctx).asObjectRef();
+        }
+
+        const escaped_html = strings.escapeHTMLForLatin1Input(ctx.bunVM().allocator, input_slice) catch {
+            JSC.JSError(undefined, "Out of memory", .{}, ctx, exception);
+            return null;
+        };
+
+        // The escaper returns its input slice untouched when nothing needed escaping.
+        if (escaped_html.ptr == input_slice.ptr and escaped_html.len == input_slice.len) {
+            return input_value.asObjectRef();
+        }
+
+        if (input_slice.len == 1) {
+            // single character escaped strings are statically allocated
+            return ZigString.init(escaped_html).toValue(ctx).asObjectRef();
+        }
+
+        // Anything else was allocated by the escaper and is passed on as an external string.
+        return ZigString.init(escaped_html).toExternalValue(ctx).asObjectRef();
+    }
+}
+
pub fn allocUnsafe(
_: void,
ctx: js.JSContextRef,
diff --git a/src/string_immutable.zig b/src/string_immutable.zig
index 094d63f91..367e6300d 100644
--- a/src/string_immutable.zig
+++ b/src/string_immutable.zig
@@ -1316,6 +1316,230 @@ pub fn elementLengthLatin1IntoUTF16(comptime Type: type, latin1_: Type) usize {
return count;
}
+/// Replaces `"`, `&`, `'`, `<` and `>` in `latin1` with `&quot;`, `&amp;`, `&#x27;`, `&lt;`
+/// and `&gt;`.
+///
+/// Ownership of the result:
+/// - If nothing needs escaping, `latin1` itself is returned (same pointer and length), so
+///   callers can detect the no-op case by comparing pointers.
+/// - For inputs of length 0 or 1 the result is either the input or a string literal; it is
+///   never allocated and must not be freed.
+/// - Otherwise the result is allocated with `allocator` and owned by the caller.
+///
+/// Returns an error if `allocator` fails; any partially built buffer is released first.
+pub fn escapeHTMLForLatin1Input(allocator: std.mem.Allocator, latin1: []const u8) ![]const u8 {
+    switch (latin1.len) {
+        0 => return "",
+        1 => return switch (latin1[0]) {
+            '"' => "&quot;",
+            '&' => "&amp;",
+            '\'' => "&#x27;",
+            '<' => "&lt;",
+            '>' => "&gt;",
+            else => latin1,
+        },
+        else => {
+            var remaining = latin1;
+
+            // One splatted vector per character that needs escaping.
+            const vec_chars = "\"&'<>";
+            const vecs: [vec_chars.len]AsciiVector = comptime brk: {
+                var _vecs: [vec_chars.len]AsciiVector = undefined;
+                for (vec_chars) |c, i| {
+                    _vecs[i] = @splat(ascii_vector_size, c);
+                }
+                break :brk _vecs;
+            };
+
+            // `buf` stays undefined until the first escapable character is found;
+            // `any_needs_escape` is set as soon as it has been allocated.
+            var buf: std.ArrayList(u8) = undefined;
+            var any_needs_escape = false;
+            errdefer {
+                if (any_needs_escape) buf.deinit();
+            }
+
+            if (comptime Environment.isAarch64 or Environment.isX64) {
+
+                // pass #1: scan for any characters that need escaping
+                // assume most strings won't need any escaping, so don't actually allocate the buffer
+                scan_and_allocate_lazily: while (remaining.len >= ascii_vector_size) {
+                    if (comptime Environment.allow_assert) {
+                        std.debug.assert(!any_needs_escape);
+                    }
+
+                    const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                    if (@reduce(
+                        .Or,
+                        @bitCast(AsciiVectorU1, (vec == vecs[0])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[1])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[2])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[3])) |
+                            @bitCast(AsciiVectorU1, (vec == vecs[4])),
+                    ) == 1) {
+                        // `latin1.len + 6` is only an initial size: a single vector can expand
+                        // by far more than 6 bytes, so every append below must be allowed to grow.
+                        buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                        any_needs_escape = true;
+
+                        // Copy the prefix that was already scanned and found clean.
+                        const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                        @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                        buf.items.len = copy_len;
+
+                        comptime var i: usize = 0;
+                        inline while (i < ascii_vector_size) : (i += 1) {
+                            switch (vec[i]) {
+                                '"' => {
+                                    try buf.appendSlice("&quot;");
+                                },
+                                '&' => {
+                                    try buf.appendSlice("&amp;");
+                                },
+                                '\'' => {
+                                    try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39'
+                                },
+                                '<' => {
+                                    try buf.appendSlice("&lt;");
+                                },
+                                '>' => {
+                                    try buf.appendSlice("&gt;");
+                                },
+                                else => |c| {
+                                    // Must not assume spare capacity: e.g. "<<<" followed by 13
+                                    // plain bytes in a 16-byte input needs 25 bytes, not 22.
+                                    try buf.append(c);
+                                },
+                            }
+                        }
+                        remaining = remaining[ascii_vector_size..];
+                        break :scan_and_allocate_lazily;
+                    }
+
+                    remaining = remaining[ascii_vector_size..];
+                }
+
+                if (any_needs_escape) {
+                    // pass #2: we found something that needed an escape
+                    // so we'll go ahead and copy the buffer into a new buffer
+                    while (remaining.len >= ascii_vector_size) {
+                        const vec: AsciiVector = remaining[0..ascii_vector_size].*;
+                        if (@reduce(
+                            .Or,
+                            @bitCast(AsciiVectorU1, (vec == vecs[0])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[1])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[2])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[3])) |
+                                @bitCast(AsciiVectorU1, (vec == vecs[4])),
+                        ) == 1) {
+                            try buf.ensureUnusedCapacity(ascii_vector_size);
+                            comptime var i: usize = 0;
+                            inline while (i < ascii_vector_size) : (i += 1) {
+                                switch (vec[i]) {
+                                    '"' => {
+                                        try buf.appendSlice("&quot;");
+                                    },
+                                    '&' => {
+                                        try buf.appendSlice("&amp;");
+                                    },
+                                    '\'' => {
+                                        try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39'
+                                    },
+                                    '<' => {
+                                        try buf.appendSlice("&lt;");
+                                    },
+                                    '>' => {
+                                        try buf.appendSlice("&gt;");
+                                    },
+                                    else => |c| {
+                                        try buf.append(c);
+                                    },
+                                }
+                            }
+
+                            remaining = remaining[ascii_vector_size..];
+                            continue;
+                        }
+
+                        // Clean vector: copy all of its bytes in one store.
+                        try buf.ensureUnusedCapacity(ascii_vector_size);
+                        buf.items.ptr[buf.items.len .. buf.items.len + ascii_vector_size][0..ascii_vector_size].* = remaining[0..ascii_vector_size].*;
+                        buf.items.len += ascii_vector_size;
+                        remaining = remaining[ascii_vector_size..];
+                    }
+                }
+            }
+
+            if (!any_needs_escape) {
+                // Scalar scan of whatever the vector loop did not cover (everything on other
+                // architectures). Still lazy: allocate only at the first escapable byte.
+                scan_and_allocate_lazily: while (remaining.len > 0) {
+                    switch (remaining[0]) {
+                        '"', '&', '\'', '<', '>' => |c| {
+                            const entity: []const u8 = switch (c) {
+                                '"' => "&quot;",
+                                '&' => "&amp;",
+                                '\'' => "&#x27;", // modified from escape-html; used to be '&#39'
+                                '<' => "&lt;",
+                                '>' => "&gt;",
+                                else => unreachable,
+                            };
+
+                            const copy_len = @ptrToInt(remaining.ptr) - @ptrToInt(latin1.ptr);
+                            buf = try std.ArrayList(u8).initCapacity(allocator, latin1.len + 6);
+                            any_needs_escape = true;
+                            @memcpy(buf.items.ptr, latin1.ptr, copy_len);
+                            buf.items.len = copy_len;
+                            try buf.appendSlice(entity);
+                            remaining = remaining[1..];
+                            break :scan_and_allocate_lazily;
+                        },
+                        else => {
+                            remaining = remaining[1..];
+                        },
+                    }
+                }
+            }
+
+            if (remaining.len > 0) {
+                // Bytes are only left over once a buffer exists: the scans above consume
+                // everything when nothing needed escaping.
+                std.debug.assert(any_needs_escape);
+                for (remaining) |c| {
+                    switch (c) {
+                        '"' => {
+                            try buf.appendSlice("&quot;");
+                        },
+                        '&' => {
+                            try buf.appendSlice("&amp;");
+                        },
+                        '\'' => {
+                            try buf.appendSlice("&#x27;"); // modified from escape-html; used to be '&#39'
+                        },
+                        '<' => {
+                            try buf.appendSlice("&lt;");
+                        },
+                        '>' => {
+                            try buf.appendSlice("&gt;");
+                        },
+                        else => {
+                            try buf.append(c);
+                        },
+                    }
+                }
+            }
+
+            if (any_needs_escape) {
+                return buf.toOwnedSlice();
+            } else {
+                return latin1;
+            }
+        },
+    }
+}
+
test "copyLatin1IntoUTF8" {
var input: string = "hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!hello world!";
var output = std.mem.zeroes([500]u8);