decode regex if needed (#5167)

* decode regex if non-ascii * make it comptime * add test * use `bun.BabyList(u16)`
author: Dylan Conway <35280289+dylan-conway@users.noreply.github.com> 2023-09-13 01:26:18 -0700
committer: GitHub <noreply@github.com> 2023-09-13 01:26:18 -0700
commit: 32664df254be225dd195fcaf46994f0c550f9d22 (patch)
tree: 3636f2c68bf3e8926eae3c83ee5930968023a02e
parent: 15f7bacb8bd57615475e1614f5f93b23810e63b1 (diff)
download: bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.gz
bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.zst
bun-32664df254be225dd195fcaf46994f0c550f9d22.zip
5 files changed, 167 insertions, 126 deletions
diff --git a/src/js_ast.zig b/src/js_ast.zig
index 7811541f4..d4a3b1d92 100644
--- a/src/js_ast.zig
+++ b/src/js_ast.zig
@@ -2538,7 +2538,10 @@ pub const E = struct {
     };
 
     pub const RegExp = struct {
-        value: string,
+        data: union(enum) {
+            raw: string,
+            decoded: bun.BabyList(u16),
+        },
 
         // This exists for JavaScript bindings
         // The RegExp constructor expects flags as a second argument.
@@ -2548,7 +2551,7 @@ pub const E = struct {
         //      ^
         flags_offset: ?u16 = null,
 
-        pub var empty = RegExp{ .value = "" };
+        pub var empty = RegExp{ .data = .{ .raw = "" } };
 
         pub fn pattern(this: RegExp) string {
 
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index 1417ea01e..01852bb65 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -295,7 +295,7 @@ fn NewLexer_(
             this.comments_to_preserve_before.clearAndFree();
         }
 
-        fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
+        pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
             var buf = buf_.*;
             defer buf_.* = buf;
             if (comptime is_json) lexer.is_ascii_only = false;
@@ -2075,9 +2075,11 @@ fn NewLexer_(
             if (comptime is_json) unreachable;
         }
 
-        pub fn scanRegExp(lexer: *LexerType) !void {
+        // returns true if the regex contents need to be decoded
+        pub fn scanRegExp(lexer: *LexerType) !bool {
             lexer.assertNotJSON();
             lexer.regex_flags_start = null;
+            var decode = lexer.code_point >= 0x80;
             while (true) {
                 switch (lexer.code_point) {
                     '/' => {
@@ -2121,20 +2123,48 @@ fn NewLexer_(
                                 },
                             }
                         }
-                        return;
+
+                        return decode;
                     },
                     '[' => {
                         lexer.step();
+                        if (lexer.code_point >= 0x80) decode = true;
                         while (lexer.code_point != ']') {
-                            try lexer.scanRegExpValidateAndStep();
+                            try lexer.scanRegExpValidateAndStep(&decode);
                         }
                         lexer.step();
+                        if (lexer.code_point >= 0x80) decode = true;
                     },
                     else => {
-                        try lexer.scanRegExpValidateAndStep();
+                        try lexer.scanRegExpValidateAndStep(&decode);
                     },
                 }
             }
+
+            return decode;
+        }
+
+        fn scanRegExpValidateAndStep(lexer: *LexerType, decode: *bool) !void {
+            lexer.assertNotJSON();
+
+            if (lexer.code_point == '\\') {
+                lexer.step();
+                if (lexer.code_point >= 0x80) decode.* = true;
+            }
+
+            switch (lexer.code_point) {
+                '\r', '\n', 0x2028, 0x2029 => {
+                    // Newlines aren't allowed in regular expressions
+                    try lexer.syntaxError();
+                },
+                -1 => { // EOF
+                    try lexer.syntaxError();
+                },
+                else => {
+                    lexer.step();
+                    if (lexer.code_point >= 0x80) decode.* = true;
+                },
+            }
         }
 
         // TODO: use wtf-8 encoding.
@@ -2592,27 +2622,6 @@ fn NewLexer_(
             try lexer.nextInsideJSXElement();
         }
 
-        fn scanRegExpValidateAndStep(lexer: *LexerType) !void {
-            lexer.assertNotJSON();
-
-            if (lexer.code_point == '\\') {
-                lexer.step();
-            }
-
-            switch (lexer.code_point) {
-                '\r', '\n', 0x2028, 0x2029 => {
-                    // Newlines aren't allowed in regular expressions
-                    try lexer.syntaxError();
-                },
-                -1 => { // EOF
-                    try lexer.syntaxError();
-                },
-                else => {
-                    lexer.step();
-                },
-            }
-        }
-
         pub fn rescanCloseBraceAsTemplateToken(lexer: *LexerType) !void {
             lexer.assertNotJSON();
 
diff --git a/src/js_parser.zig b/src/js_parser.zig
index a61657128..2e6b9e336 100644
--- a/src/js_parser.zig
+++ b/src/js_parser.zig
@@ -13095,13 +13095,39 @@ fn NewParser_(
                     return p.newExpr(E.BigInt{ .value = value }, loc);
                 },
                 .t_slash, .t_slash_equals => {
-                    try p.lexer.scanRegExp();
+                    const needs_decode = try p.lexer.scanRegExp();
                     // always set regex_flags_start to null to make sure we don't accidentally use the wrong value later
                     defer p.lexer.regex_flags_start = null;
-                    const value = p.lexer.raw();
+
+                    const raw = p.lexer.raw();
+
+                    if (!needs_decode) {
+                        try p.lexer.next();
+                        return p.newExpr(
+                            E.RegExp{
+                                .data = .{
+                                    .raw = raw,
+                                },
+                                .flags_offset = p.lexer.regex_flags_start,
+                            },
+                            loc,
+                        );
+                    }
+
+                    var buf = std.ArrayList(u16).initCapacity(p.allocator, raw.len) catch unreachable;
+                    try p.lexer.decodeEscapeSequences(p.lexer.start, raw, @TypeOf(buf), &buf);
+
                     try p.lexer.next();
 
-                    return p.newExpr(E.RegExp{ .value = value, .flags_offset = p.lexer.regex_flags_start }, loc);
+                    return p.newExpr(
+                        E.RegExp{
+                            .data = .{
+                                .decoded = bun.BabyList(u16).init(buf.items),
+                            },
+                            .flags_offset = p.lexer.regex_flags_start,
+                        },
+                        loc,
+                    );
                 },
                 .t_void => {
                     try p.lexer.next();
diff --git a/src/js_printer.zig b/src/js_printer.zig
index 57ef580b6..3caa5f11e 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -1437,7 +1437,7 @@ fn NewPrinter(
             ) catch unreachable;
         }
 
-        pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void {
+        fn printUTF16(e: *Printer, text: []const u16, comptime quoted: bool, quote: u8) void {
             var i: usize = 0;
             const n: usize = text.len;
 
@@ -1471,26 +1471,42 @@ fn NewPrinter(
                         e.print("\\x07");
                     },
                     0x08 => {
-                        if (quote == '`')
-                            e.print(0x08)
-                        else
+                        if (comptime quoted) {
+                            if (quote == '`')
+                                e.print(0x08)
+                            else
+                                e.print("\\b");
+                        } else {
                             e.print("\\b");
+                        }
                     },
                     0x0C => {
-                        if (quote == '`')
-                            e.print(0x000C)
-                        else
+                        if (comptime quoted) {
+                            if (quote == '`')
+                                e.print(0x000C)
+                            else
+                                e.print("\\f");
+                        } else {
                             e.print("\\f");
+                        }
                     },
                     '\t' => {
-                        if (quote == '`')
-                            e.print("\t")
-                        else
+                        if (comptime quoted) {
+                            if (quote == '`')
+                                e.print("\t")
+                            else
+                                e.print("\\t");
+                        } else {
                             e.print("\\t");
+                        }
                     },
                     '\n' => {
-                        if (quote == '`') {
-                            e.print('\n');
+                        if (comptime quoted) {
+                            if (quote == '`') {
+                                e.print('\n');
+                            } else {
+                                e.print("\\n");
+                            }
                         } else {
                             e.print("\\n");
                         }
@@ -1501,8 +1517,12 @@ fn NewPrinter(
                     },
                     // \v
                     std.ascii.control_code.vt => {
-                        if (quote == '`') {
-                            e.print(std.ascii.control_code.vt);
+                        if (comptime quoted) {
+                            if (quote == '`') {
+                                e.print(std.ascii.control_code.vt);
+                            } else {
+                                e.print("\\v");
+                            }
                         } else {
                             e.print("\\v");
                         }
@@ -1513,29 +1533,37 @@ fn NewPrinter(
                     },
 
                     '\'' => {
-                        if (quote == '\'') {
-                            e.print('\\');
+                        if (comptime quoted) {
+                            if (quote == '\'') {
+                                e.print('\\');
+                            }
                         }
                         e.print("'");
                     },
 
                     '"' => {
-                        if (quote == '"') {
-                            e.print('\\');
+                        if (comptime quoted) {
+                            if (quote == '"') {
+                                e.print('\\');
+                            }
                         }
 
                         e.print("\"");
                     },
                     '`' => {
-                        if (quote == '`') {
-                            e.print('\\');
+                        if (comptime quoted) {
+                            if (quote == '`') {
+                                e.print('\\');
+                            }
                         }
 
                         e.print("`");
                     },
                     '$' => {
-                        if (quote == '`' and i < n and text[i] == '{') {
-                            e.print('\\');
+                        if (comptime quoted) {
+                            if (quote == '`' and i < n and text[i] == '{') {
+                                e.print('\\');
+                            }
                         }
 
                         e.print('$');
@@ -1559,32 +1587,34 @@ fn NewPrinter(
                                 // this only applies to template literal strings
                                 // but we print a template literal if there is a \n or a \r
                                 // which is often if the string is long and UTF-16
-                                if (quote == '`') {
-                                    const remain = text[i..];
-                                    if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
-                                        remain[0] != '$' and
-                                        remain[0] != '\\' and
-                                        remain[0] != '`')
-                                    {
-                                        if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
-                                            if (count_ == 0)
-                                                unreachable; // conditional above checks this
-
-                                            const len = count_ - 1;
-                                            i += len;
-                                            var ptr = e.writer.reserve(len) catch unreachable;
-                                            var to_copy = ptr[0..len];
-
-                                            strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
-                                            e.writer.advance(len);
-                                            continue :outer;
-                                        } else {
-                                            const count = @as(u32, @truncate(remain.len));
-                                            var ptr = e.writer.reserve(count) catch unreachable;
-                                            var to_copy = ptr[0..count];
-                                            strings.copyU16IntoU8(to_copy, []const u16, remain);
-                                            e.writer.advance(count);
-                                            i += count;
+                                if (comptime quoted) {
+                                    if (quote == '`') {
+                                        const remain = text[i..];
+                                        if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
+                                            remain[0] != '$' and
+                                            remain[0] != '\\' and
+                                            remain[0] != '`')
+                                        {
+                                            if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
+                                                if (count_ == 0)
+                                                    unreachable; // conditional above checks this
+
+                                                const len = count_ - 1;
+                                                i += len;
+                                                var ptr = e.writer.reserve(len) catch unreachable;
+                                                var to_copy = ptr[0..len];
+
+                                                strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
+                                                e.writer.advance(len);
+                                                continue :outer;
+                                            } else {
+                                                const count = @as(u32, @truncate(remain.len));
+                                                var ptr = e.writer.reserve(count) catch unreachable;
+                                                var to_copy = ptr[0..count];
+                                                strings.copyU16IntoU8(to_copy, []const u16, remain);
+                                                e.writer.advance(count);
+                                                i += count;
+                                            }
                                         }
                                     }
                                 }
@@ -1664,6 +1694,14 @@ fn NewPrinter(
             }
         }
 
+        pub fn printUnquotedUTF16(p: *Printer, text: []const u16) void {
+            p.printUTF16(text, false, 0);
+        }
+
+        pub fn printQuotedUTF16(p: *Printer, text: []const u16, quote: u8) void {
+            p.printUTF16(text, true, quote);
+        }
+
         pub fn isUnboundEvalIdentifier(p: *Printer, value: Expr) bool {
             switch (value.data) {
                 .e_identifier => |ident| {
@@ -3149,53 +3187,13 @@ fn NewPrinter(
                 p.print(" ");
             }
 
-            if (comptime is_bun_platform) {
-                // Translate any non-ASCII to unicode escape sequences
-                var ascii_start: usize = 0;
-                var is_ascii = false;
-                var iter = CodepointIterator.init(e.value);
-                var cursor = CodepointIterator.Cursor{};
-                while (iter.next(&cursor)) {
-                    switch (cursor.c) {
-                        first_ascii...last_ascii => {
-                            if (!is_ascii) {
-                                ascii_start = cursor.i;
-                                is_ascii = true;
-                            }
-                        },
-                        else => {
-                            if (is_ascii) {
-                                p.print(e.value[ascii_start..cursor.i]);
-                                is_ascii = false;
-                            }
-
-                            switch (cursor.c) {
-                                0...0xFFFF => {
-                                    p.print([_]u8{
-                                        '\\',
-                                        'u',
-                                        hex_chars[cursor.c >> 12],
-                                        hex_chars[(cursor.c >> 8) & 15],
-                                        hex_chars[(cursor.c >> 4) & 15],
-                                        hex_chars[cursor.c & 15],
-                                    });
-                                },
-                                else => {
-                                    p.print("\\u{");
-                                    std.fmt.formatInt(cursor.c, 16, .lower, .{}, p) catch unreachable;
-                                    p.print("}");
-                                },
-                            }
-                        },
-                    }
-                }
-
-                if (is_ascii) {
-                    p.print(e.value[ascii_start..]);
-                }
-            } else {
-                // UTF8 sequence is fine
-                p.print(e.value);
+            switch (e.data) {
+                .raw => |raw| {
+                    p.print(raw);
+                },
+                .decoded => |decoded| {
+                    p.printUnquotedUTF16(decoded.slice());
+                },
             }
 
             // Need a space before the next identifier to avoid it turning into flags
diff --git a/test/transpiler/transpiler.test.js b/test/transpiler/transpiler.test.js
index a6c2dcf73..c80a0670a 100644
--- a/test/transpiler/transpiler.test.js
+++ b/test/transpiler/transpiler.test.js
@@ -1973,6 +1973,11 @@ console.log(resolve.length)
       expectParseError("/x/msuygig", 'Duplicate flag "g" in regular expression');
     });
 
+    it("non-ascii regexp literals", () => {
+      var str = "🔴11 54 / 10,000";
+      expect(str.replace(/[🔵🔴,]+/g, "")).toBe("11 54 / 10000");
+    });
+
     it("identifier escapes", () => {
       expectPrinted_("var _\u0076\u0061\u0072", "var _var");
       expectParseError("var \u0076\u0061\u0072", 'Expected identifier but found "\u0076\u0061\u0072"');
author	Dylan Conway <35280289+dylan-conway@users.noreply.github.com>	2023-09-13 01:26:18 -0700
committer	GitHub <noreply@github.com>	2023-09-13 01:26:18 -0700
commit	32664df254be225dd195fcaf46994f0c550f9d22 (patch)
tree	3636f2c68bf3e8926eae3c83ee5930968023a02e
parent	15f7bacb8bd57615475e1614f5f93b23810e63b1 (diff)
download	bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.gz bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.zst bun-32664df254be225dd195fcaf46994f0c550f9d22.zip