about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-09-13 01:26:18 -0700
committer: GitHub <noreply@github.com> 2023-09-13 01:26:18 -0700
commit: 32664df254be225dd195fcaf46994f0c550f9d22 (patch)
tree: 3636f2c68bf3e8926eae3c83ee5930968023a02e
parent: 15f7bacb8bd57615475e1614f5f93b23810e63b1 (diff)
download: bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.gz
bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.zst
bun-32664df254be225dd195fcaf46994f0c550f9d22.zip
decode regex if needed (#5167)
* decode regex if non-ascii * make it comptime * add test * use `bun.BabyList(u16)`
-rw-r--r-- src/js_ast.zig 7
-rw-r--r-- src/js_lexer.zig 61
-rw-r--r-- src/js_parser.zig 32
-rw-r--r-- src/js_printer.zig 188
-rw-r--r-- test/transpiler/transpiler.test.js 5
5 files changed, 167 insertions, 126 deletions
diff --git a/src/js_ast.zig b/src/js_ast.zig
index 7811541f4..d4a3b1d92 100644
--- a/src/js_ast.zig
+++ b/src/js_ast.zig
@@ -2538,7 +2538,10 @@ pub const E = struct {
};
pub const RegExp = struct {
- value: string,
+ data: union(enum) {
+ raw: string,
+ decoded: bun.BabyList(u16),
+ },
// This exists for JavaScript bindings
// The RegExp constructor expects flags as a second argument.
@@ -2548,7 +2551,7 @@ pub const E = struct {
// ^
flags_offset: ?u16 = null,
- pub var empty = RegExp{ .value = "" };
+ pub var empty = RegExp{ .data = .{ .raw = "" } };
pub fn pattern(this: RegExp) string {
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index 1417ea01e..01852bb65 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -295,7 +295,7 @@ fn NewLexer_(
this.comments_to_preserve_before.clearAndFree();
}
- fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
+ pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
var buf = buf_.*;
defer buf_.* = buf;
if (comptime is_json) lexer.is_ascii_only = false;
@@ -2075,9 +2075,11 @@ fn NewLexer_(
if (comptime is_json) unreachable;
}
- pub fn scanRegExp(lexer: *LexerType) !void {
+ // returns true if the regex contents need to be decoded
+ pub fn scanRegExp(lexer: *LexerType) !bool {
lexer.assertNotJSON();
lexer.regex_flags_start = null;
+ var decode = lexer.code_point >= 0x80;
while (true) {
switch (lexer.code_point) {
'/' => {
@@ -2121,20 +2123,48 @@ fn NewLexer_(
},
}
}
- return;
+
+ return decode;
},
'[' => {
lexer.step();
+ if (lexer.code_point >= 0x80) decode = true;
while (lexer.code_point != ']') {
- try lexer.scanRegExpValidateAndStep();
+ try lexer.scanRegExpValidateAndStep(&decode);
}
lexer.step();
+ if (lexer.code_point >= 0x80) decode = true;
},
else => {
- try lexer.scanRegExpValidateAndStep();
+ try lexer.scanRegExpValidateAndStep(&decode);
},
}
}
+
+ return decode;
+ }
+
+ fn scanRegExpValidateAndStep(lexer: *LexerType, decode: *bool) !void {
+ lexer.assertNotJSON();
+
+ if (lexer.code_point == '\\') {
+ lexer.step();
+ if (lexer.code_point >= 0x80) decode.* = true;
+ }
+
+ switch (lexer.code_point) {
+ '\r', '\n', 0x2028, 0x2029 => {
+ // Newlines aren't allowed in regular expressions
+ try lexer.syntaxError();
+ },
+ -1 => { // EOF
+ try lexer.syntaxError();
+ },
+ else => {
+ lexer.step();
+ if (lexer.code_point >= 0x80) decode.* = true;
+ },
+ }
}
// TODO: use wtf-8 encoding.
@@ -2592,27 +2622,6 @@ fn NewLexer_(
try lexer.nextInsideJSXElement();
}
- fn scanRegExpValidateAndStep(lexer: *LexerType) !void {
- lexer.assertNotJSON();
-
- if (lexer.code_point == '\\') {
- lexer.step();
- }
-
- switch (lexer.code_point) {
- '\r', '\n', 0x2028, 0x2029 => {
- // Newlines aren't allowed in regular expressions
- try lexer.syntaxError();
- },
- -1 => { // EOF
- try lexer.syntaxError();
- },
- else => {
- lexer.step();
- },
- }
- }
-
pub fn rescanCloseBraceAsTemplateToken(lexer: *LexerType) !void {
lexer.assertNotJSON();
diff --git a/src/js_parser.zig b/src/js_parser.zig
index a61657128..2e6b9e336 100644
--- a/src/js_parser.zig
+++ b/src/js_parser.zig
@@ -13095,13 +13095,39 @@ fn NewParser_(
return p.newExpr(E.BigInt{ .value = value }, loc);
},
.t_slash, .t_slash_equals => {
- try p.lexer.scanRegExp();
+ const needs_decode = try p.lexer.scanRegExp();
// always set regex_flags_start to null to make sure we don't accidentally use the wrong value later
defer p.lexer.regex_flags_start = null;
- const value = p.lexer.raw();
+
+ const raw = p.lexer.raw();
+
+ if (!needs_decode) {
+ try p.lexer.next();
+ return p.newExpr(
+ E.RegExp{
+ .data = .{
+ .raw = raw,
+ },
+ .flags_offset = p.lexer.regex_flags_start,
+ },
+ loc,
+ );
+ }
+
+ var buf = std.ArrayList(u16).initCapacity(p.allocator, raw.len) catch unreachable;
+ try p.lexer.decodeEscapeSequences(p.lexer.start, raw, @TypeOf(buf), &buf);
+
try p.lexer.next();
- return p.newExpr(E.RegExp{ .value = value, .flags_offset = p.lexer.regex_flags_start }, loc);
+ return p.newExpr(
+ E.RegExp{
+ .data = .{
+ .decoded = bun.BabyList(u16).init(buf.items),
+ },
+ .flags_offset = p.lexer.regex_flags_start,
+ },
+ loc,
+ );
},
.t_void => {
try p.lexer.next();
diff --git a/src/js_printer.zig b/src/js_printer.zig
index 57ef580b6..3caa5f11e 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -1437,7 +1437,7 @@ fn NewPrinter(
) catch unreachable;
}
- pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void {
+ fn printUTF16(e: *Printer, text: []const u16, comptime quoted: bool, quote: u8) void {
var i: usize = 0;
const n: usize = text.len;
@@ -1471,26 +1471,42 @@ fn NewPrinter(
e.print("\\x07");
},
0x08 => {
- if (quote == '`')
- e.print(0x08)
- else
+ if (comptime quoted) {
+ if (quote == '`')
+ e.print(0x08)
+ else
+ e.print("\\b");
+ } else {
e.print("\\b");
+ }
},
0x0C => {
- if (quote == '`')
- e.print(0x000C)
- else
+ if (comptime quoted) {
+ if (quote == '`')
+ e.print(0x000C)
+ else
+ e.print("\\f");
+ } else {
e.print("\\f");
+ }
},
'\t' => {
- if (quote == '`')
- e.print("\t")
- else
+ if (comptime quoted) {
+ if (quote == '`')
+ e.print("\t")
+ else
+ e.print("\\t");
+ } else {
e.print("\\t");
+ }
},
'\n' => {
- if (quote == '`') {
- e.print('\n');
+ if (comptime quoted) {
+ if (quote == '`') {
+ e.print('\n');
+ } else {
+ e.print("\\n");
+ }
} else {
e.print("\\n");
}
@@ -1501,8 +1517,12 @@ fn NewPrinter(
},
// \v
std.ascii.control_code.vt => {
- if (quote == '`') {
- e.print(std.ascii.control_code.vt);
+ if (comptime quoted) {
+ if (quote == '`') {
+ e.print(std.ascii.control_code.vt);
+ } else {
+ e.print("\\v");
+ }
} else {
e.print("\\v");
}
@@ -1513,29 +1533,37 @@ fn NewPrinter(
},
'\'' => {
- if (quote == '\'') {
- e.print('\\');
+ if (comptime quoted) {
+ if (quote == '\'') {
+ e.print('\\');
+ }
}
e.print("'");
},
'"' => {
- if (quote == '"') {
- e.print('\\');
+ if (comptime quoted) {
+ if (quote == '"') {
+ e.print('\\');
+ }
}
e.print("\"");
},
'`' => {
- if (quote == '`') {
- e.print('\\');
+ if (comptime quoted) {
+ if (quote == '`') {
+ e.print('\\');
+ }
}
e.print("`");
},
'$' => {
- if (quote == '`' and i < n and text[i] == '{') {
- e.print('\\');
+ if (comptime quoted) {
+ if (quote == '`' and i < n and text[i] == '{') {
+ e.print('\\');
+ }
}
e.print('$');
@@ -1559,32 +1587,34 @@ fn NewPrinter(
// this only applies to template literal strings
// but we print a template literal if there is a \n or a \r
// which is often if the string is long and UTF-16
- if (quote == '`') {
- const remain = text[i..];
- if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
- remain[0] != '$' and
- remain[0] != '\\' and
- remain[0] != '`')
- {
- if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
- if (count_ == 0)
- unreachable; // conditional above checks this
-
- const len = count_ - 1;
- i += len;
- var ptr = e.writer.reserve(len) catch unreachable;
- var to_copy = ptr[0..len];
-
- strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
- e.writer.advance(len);
- continue :outer;
- } else {
- const count = @as(u32, @truncate(remain.len));
- var ptr = e.writer.reserve(count) catch unreachable;
- var to_copy = ptr[0..count];
- strings.copyU16IntoU8(to_copy, []const u16, remain);
- e.writer.advance(count);
- i += count;
+ if (comptime quoted) {
+ if (quote == '`') {
+ const remain = text[i..];
+ if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
+ remain[0] != '$' and
+ remain[0] != '\\' and
+ remain[0] != '`')
+ {
+ if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
+ if (count_ == 0)
+ unreachable; // conditional above checks this
+
+ const len = count_ - 1;
+ i += len;
+ var ptr = e.writer.reserve(len) catch unreachable;
+ var to_copy = ptr[0..len];
+
+ strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
+ e.writer.advance(len);
+ continue :outer;
+ } else {
+ const count = @as(u32, @truncate(remain.len));
+ var ptr = e.writer.reserve(count) catch unreachable;
+ var to_copy = ptr[0..count];
+ strings.copyU16IntoU8(to_copy, []const u16, remain);
+ e.writer.advance(count);
+ i += count;
+ }
}
}
}
@@ -1664,6 +1694,14 @@ fn NewPrinter(
}
}
+ pub fn printUnquotedUTF16(p: *Printer, text: []const u16) void {
+ p.printUTF16(text, false, 0);
+ }
+
+ pub fn printQuotedUTF16(p: *Printer, text: []const u16, quote: u8) void {
+ p.printUTF16(text, true, quote);
+ }
+
pub fn isUnboundEvalIdentifier(p: *Printer, value: Expr) bool {
switch (value.data) {
.e_identifier => |ident| {
@@ -3149,53 +3187,13 @@ fn NewPrinter(
p.print(" ");
}
- if (comptime is_bun_platform) {
- // Translate any non-ASCII to unicode escape sequences
- var ascii_start: usize = 0;
- var is_ascii = false;
- var iter = CodepointIterator.init(e.value);
- var cursor = CodepointIterator.Cursor{};
- while (iter.next(&cursor)) {
- switch (cursor.c) {
- first_ascii...last_ascii => {
- if (!is_ascii) {
- ascii_start = cursor.i;
- is_ascii = true;
- }
- },
- else => {
- if (is_ascii) {
- p.print(e.value[ascii_start..cursor.i]);
- is_ascii = false;
- }
-
- switch (cursor.c) {
- 0...0xFFFF => {
- p.print([_]u8{
- '\\',
- 'u',
- hex_chars[cursor.c >> 12],
- hex_chars[(cursor.c >> 8) & 15],
- hex_chars[(cursor.c >> 4) & 15],
- hex_chars[cursor.c & 15],
- });
- },
- else => {
- p.print("\\u{");
- std.fmt.formatInt(cursor.c, 16, .lower, .{}, p) catch unreachable;
- p.print("}");
- },
- }
- },
- }
- }
-
- if (is_ascii) {
- p.print(e.value[ascii_start..]);
- }
- } else {
- // UTF8 sequence is fine
- p.print(e.value);
+ switch (e.data) {
+ .raw => |raw| {
+ p.print(raw);
+ },
+ .decoded => |decoded| {
+ p.printUnquotedUTF16(decoded.slice());
+ },
}
// Need a space before the next identifier to avoid it turning into flags
diff --git a/test/transpiler/transpiler.test.js b/test/transpiler/transpiler.test.js
index a6c2dcf73..c80a0670a 100644
--- a/test/transpiler/transpiler.test.js
+++ b/test/transpiler/transpiler.test.js
@@ -1973,6 +1973,11 @@ console.log(resolve.length)
expectParseError("/x/msuygig", 'Duplicate flag "g" in regular expression');
});
+ it("non-ascii regexp literals", () => {
+ var str = "🔴11 54 / 10,000";
+ expect(str.replace(/[🔵🔴,]+/g, "")).toBe("11 54 / 10000");
+ });
+
it("identifier escapes", () => {
expectPrinted_("var _\u0076\u0061\u0072", "var _var");
expectParseError("var \u0076\u0061\u0072", 'Expected identifier but found "\u0076\u0061\u0072"');