Revert "decode regex if needed (#5167)" (branch: revert-5167-dylan/decode-regex-if-needed)

This reverts commit 32664df254be225dd195fcaf46994f0c550f9d22.
author: Jarred Sumner <jarred@jarredsumner.com> 2023-09-15 04:37:13 -0700
committer: GitHub <noreply@github.com> 2023-09-15 04:37:13 -0700
commit: 8c3be19d66002f12e0d4b2b201d6745ea3a9d69b (patch)
tree: ec1a9451d49f078bc0c4faf4b2cc0c7529d1bbcd
parent: 92f2d9ab27fd2ab61b8bcfe8c0b42c7d6b90cdcf (diff)
download: bun-revert-5167-dylan/decode-regex-if-needed.tar.gz
bun-revert-5167-dylan/decode-regex-if-needed.tar.zst
bun-revert-5167-dylan/decode-regex-if-needed.zip
5 files changed, 126 insertions, 167 deletions
diff --git a/src/js_ast.zig b/src/js_ast.zig
index d4a3b1d92..7811541f4 100644
--- a/src/js_ast.zig
+++ b/src/js_ast.zig
@@ -2538,10 +2538,7 @@ pub const E = struct {
     };
 
     pub const RegExp = struct {
-        data: union(enum) {
-            raw: string,
-            decoded: bun.BabyList(u16),
-        },
+        value: string,
 
         // This exists for JavaScript bindings
         // The RegExp constructor expects flags as a second argument.
@@ -2551,7 +2548,7 @@ pub const E = struct {
         //      ^
         flags_offset: ?u16 = null,
 
-        pub var empty = RegExp{ .data = .{ .raw = "" } };
+        pub var empty = RegExp{ .value = "" };
 
         pub fn pattern(this: RegExp) string {
 
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index 47b6fdcb9..24484a02b 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -295,7 +295,7 @@ fn NewLexer_(
             this.comments_to_preserve_before.clearAndFree();
         }
 
-        pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
+        fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
             var buf = buf_.*;
             defer buf_.* = buf;
             if (comptime is_json) lexer.is_ascii_only = false;
@@ -2075,11 +2075,9 @@ fn NewLexer_(
             if (comptime is_json) unreachable;
         }
 
-        // returns true of the regex contents need to be decoded
-        pub fn scanRegExp(lexer: *LexerType) !bool {
+        pub fn scanRegExp(lexer: *LexerType) !void {
             lexer.assertNotJSON();
             lexer.regex_flags_start = null;
-            var decode = lexer.code_point >= 0x80;
             while (true) {
                 switch (lexer.code_point) {
                     '/' => {
@@ -2123,48 +2121,20 @@ fn NewLexer_(
                                 },
                             }
                         }
-
-                        return decode;
+                        return;
                     },
                     '[' => {
                         lexer.step();
-                        if (lexer.code_point >= 0x80) decode = true;
                         while (lexer.code_point != ']') {
-                            try lexer.scanRegExpValidateAndStep(&decode);
+                            try lexer.scanRegExpValidateAndStep();
                         }
                         lexer.step();
-                        if (lexer.code_point >= 0x80) decode = true;
                     },
                     else => {
-                        try lexer.scanRegExpValidateAndStep(&decode);
+                        try lexer.scanRegExpValidateAndStep();
                     },
                 }
             }
-
-            return decode;
-        }
-
-        fn scanRegExpValidateAndStep(lexer: *LexerType, decode: *bool) !void {
-            lexer.assertNotJSON();
-
-            if (lexer.code_point == '\\') {
-                lexer.step();
-                if (lexer.code_point >= 0x80) decode.* = true;
-            }
-
-            switch (lexer.code_point) {
-                '\r', '\n', 0x2028, 0x2029 => {
-                    // Newlines aren't allowed in regular expressions
-                    try lexer.syntaxError();
-                },
-                -1 => { // EOF
-                    try lexer.syntaxError();
-                },
-                else => {
-                    lexer.step();
-                    if (lexer.code_point >= 0x80) decode.* = true;
-                },
-            }
         }
 
         // TODO: use wtf-8 encoding.
@@ -2622,6 +2592,27 @@ fn NewLexer_(
             try lexer.nextInsideJSXElement();
         }
 
+        fn scanRegExpValidateAndStep(lexer: *LexerType) !void {
+            lexer.assertNotJSON();
+
+            if (lexer.code_point == '\\') {
+                lexer.step();
+            }
+
+            switch (lexer.code_point) {
+                '\r', '\n', 0x2028, 0x2029 => {
+                    // Newlines aren't allowed in regular expressions
+                    try lexer.syntaxError();
+                },
+                -1 => { // EOF
+                    try lexer.syntaxError();
+                },
+                else => {
+                    lexer.step();
+                },
+            }
+        }
+
         pub fn rescanCloseBraceAsTemplateToken(lexer: *LexerType) !void {
             lexer.assertNotJSON();
 
diff --git a/src/js_parser.zig b/src/js_parser.zig
index 2e6b9e336..a61657128 100644
--- a/src/js_parser.zig
+++ b/src/js_parser.zig
@@ -13095,39 +13095,13 @@ fn NewParser_(
                     return p.newExpr(E.BigInt{ .value = value }, loc);
                 },
                 .t_slash, .t_slash_equals => {
-                    const needs_decode = try p.lexer.scanRegExp();
+                    try p.lexer.scanRegExp();
                     // always set regex_flags_start to null to make sure we don't accidentally use the wrong value later
                     defer p.lexer.regex_flags_start = null;
-
-                    const raw = p.lexer.raw();
-
-                    if (!needs_decode) {
-                        try p.lexer.next();
-                        return p.newExpr(
-                            E.RegExp{
-                                .data = .{
-                                    .raw = raw,
-                                },
-                                .flags_offset = p.lexer.regex_flags_start,
-                            },
-                            loc,
-                        );
-                    }
-
-                    var buf = std.ArrayList(u16).initCapacity(p.allocator, raw.len) catch unreachable;
-                    try p.lexer.decodeEscapeSequences(p.lexer.start, raw, @TypeOf(buf), &buf);
-
+                    const value = p.lexer.raw();
                     try p.lexer.next();
 
-                    return p.newExpr(
-                        E.RegExp{
-                            .data = .{
-                                .decoded = bun.BabyList(u16).init(buf.items),
-                            },
-                            .flags_offset = p.lexer.regex_flags_start,
-                        },
-                        loc,
-                    );
+                    return p.newExpr(E.RegExp{ .value = value, .flags_offset = p.lexer.regex_flags_start }, loc);
                 },
                 .t_void => {
                     try p.lexer.next();
diff --git a/src/js_printer.zig b/src/js_printer.zig
index 3caa5f11e..57ef580b6 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -1437,7 +1437,7 @@ fn NewPrinter(
             ) catch unreachable;
         }
 
-        fn printUTF16(e: *Printer, text: []const u16, comptime quoted: bool, quote: u8) void {
+        pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void {
             var i: usize = 0;
             const n: usize = text.len;
 
@@ -1471,42 +1471,26 @@ fn NewPrinter(
                         e.print("\\x07");
                     },
                     0x08 => {
-                        if (comptime quoted) {
-                            if (quote == '`')
-                                e.print(0x08)
-                            else
-                                e.print("\\b");
-                        } else {
+                        if (quote == '`')
+                            e.print(0x08)
+                        else
                             e.print("\\b");
-                        }
                     },
                     0x0C => {
-                        if (comptime quoted) {
-                            if (quote == '`')
-                                e.print(0x000C)
-                            else
-                                e.print("\\f");
-                        } else {
+                        if (quote == '`')
+                            e.print(0x000C)
+                        else
                             e.print("\\f");
-                        }
                     },
                     '\t' => {
-                        if (comptime quoted) {
-                            if (quote == '`')
-                                e.print("\t")
-                            else
-                                e.print("\\t");
-                        } else {
+                        if (quote == '`')
+                            e.print("\t")
+                        else
                             e.print("\\t");
-                        }
                     },
                     '\n' => {
-                        if (comptime quoted) {
-                            if (quote == '`') {
-                                e.print('\n');
-                            } else {
-                                e.print("\\n");
-                            }
+                        if (quote == '`') {
+                            e.print('\n');
                         } else {
                             e.print("\\n");
                         }
@@ -1517,12 +1501,8 @@ fn NewPrinter(
                     },
                     // \v
                     std.ascii.control_code.vt => {
-                        if (comptime quoted) {
-                            if (quote == '`') {
-                                e.print(std.ascii.control_code.vt);
-                            } else {
-                                e.print("\\v");
-                            }
+                        if (quote == '`') {
+                            e.print(std.ascii.control_code.vt);
                         } else {
                             e.print("\\v");
                         }
@@ -1533,37 +1513,29 @@ fn NewPrinter(
                     },
 
                     '\'' => {
-                        if (comptime quoted) {
-                            if (quote == '\'') {
-                                e.print('\\');
-                            }
+                        if (quote == '\'') {
+                            e.print('\\');
                         }
                         e.print("'");
                     },
 
                     '"' => {
-                        if (comptime quoted) {
-                            if (quote == '"') {
-                                e.print('\\');
-                            }
+                        if (quote == '"') {
+                            e.print('\\');
                         }
 
                         e.print("\"");
                     },
                     '`' => {
-                        if (comptime quoted) {
-                            if (quote == '`') {
-                                e.print('\\');
-                            }
+                        if (quote == '`') {
+                            e.print('\\');
                         }
 
                         e.print("`");
                     },
                     '$' => {
-                        if (comptime quoted) {
-                            if (quote == '`' and i < n and text[i] == '{') {
-                                e.print('\\');
-                            }
+                        if (quote == '`' and i < n and text[i] == '{') {
+                            e.print('\\');
                         }
 
                         e.print('$');
@@ -1587,34 +1559,32 @@ fn NewPrinter(
                                 // this only applies to template literal strings
                                 // but we print a template literal if there is a \n or a \r
                                 // which is often if the string is long and UTF-16
-                                if (comptime quoted) {
-                                    if (quote == '`') {
-                                        const remain = text[i..];
-                                        if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
-                                            remain[0] != '$' and
-                                            remain[0] != '\\' and
-                                            remain[0] != '`')
-                                        {
-                                            if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
-                                                if (count_ == 0)
-                                                    unreachable; // conditional above checks this
-
-                                                const len = count_ - 1;
-                                                i += len;
-                                                var ptr = e.writer.reserve(len) catch unreachable;
-                                                var to_copy = ptr[0..len];
-
-                                                strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
-                                                e.writer.advance(len);
-                                                continue :outer;
-                                            } else {
-                                                const count = @as(u32, @truncate(remain.len));
-                                                var ptr = e.writer.reserve(count) catch unreachable;
-                                                var to_copy = ptr[0..count];
-                                                strings.copyU16IntoU8(to_copy, []const u16, remain);
-                                                e.writer.advance(count);
-                                                i += count;
-                                            }
+                                if (quote == '`') {
+                                    const remain = text[i..];
+                                    if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
+                                        remain[0] != '$' and
+                                        remain[0] != '\\' and
+                                        remain[0] != '`')
+                                    {
+                                        if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
+                                            if (count_ == 0)
+                                                unreachable; // conditional above checks this
+
+                                            const len = count_ - 1;
+                                            i += len;
+                                            var ptr = e.writer.reserve(len) catch unreachable;
+                                            var to_copy = ptr[0..len];
+
+                                            strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
+                                            e.writer.advance(len);
+                                            continue :outer;
+                                        } else {
+                                            const count = @as(u32, @truncate(remain.len));
+                                            var ptr = e.writer.reserve(count) catch unreachable;
+                                            var to_copy = ptr[0..count];
+                                            strings.copyU16IntoU8(to_copy, []const u16, remain);
+                                            e.writer.advance(count);
+                                            i += count;
                                         }
                                     }
                                 }
@@ -1694,14 +1664,6 @@ fn NewPrinter(
             }
         }
 
-        pub fn printUnquotedUTF16(p: *Printer, text: []const u16) void {
-            p.printUTF16(text, false, 0);
-        }
-
-        pub fn printQuotedUTF16(p: *Printer, text: []const u16, quote: u8) void {
-            p.printUTF16(text, true, quote);
-        }
-
         pub fn isUnboundEvalIdentifier(p: *Printer, value: Expr) bool {
             switch (value.data) {
                 .e_identifier => |ident| {
@@ -3187,13 +3149,53 @@ fn NewPrinter(
                 p.print(" ");
             }
 
-            switch (e.data) {
-                .raw => |raw| {
-                    p.print(raw);
-                },
-                .decoded => |decoded| {
-                    p.printUnquotedUTF16(decoded.slice());
-                },
+            if (comptime is_bun_platform) {
+                // Translate any non-ASCII to unicode escape sequences
+                var ascii_start: usize = 0;
+                var is_ascii = false;
+                var iter = CodepointIterator.init(e.value);
+                var cursor = CodepointIterator.Cursor{};
+                while (iter.next(&cursor)) {
+                    switch (cursor.c) {
+                        first_ascii...last_ascii => {
+                            if (!is_ascii) {
+                                ascii_start = cursor.i;
+                                is_ascii = true;
+                            }
+                        },
+                        else => {
+                            if (is_ascii) {
+                                p.print(e.value[ascii_start..cursor.i]);
+                                is_ascii = false;
+                            }
+
+                            switch (cursor.c) {
+                                0...0xFFFF => {
+                                    p.print([_]u8{
+                                        '\\',
+                                        'u',
+                                        hex_chars[cursor.c >> 12],
+                                        hex_chars[(cursor.c >> 8) & 15],
+                                        hex_chars[(cursor.c >> 4) & 15],
+                                        hex_chars[cursor.c & 15],
+                                    });
+                                },
+                                else => {
+                                    p.print("\\u{");
+                                    std.fmt.formatInt(cursor.c, 16, .lower, .{}, p) catch unreachable;
+                                    p.print("}");
+                                },
+                            }
+                        },
+                    }
+                }
+
+                if (is_ascii) {
+                    p.print(e.value[ascii_start..]);
+                }
+            } else {
+                // UTF8 sequence is fine
+                p.print(e.value);
             }
 
             // Need a space before the next identifier to avoid it turning into flags
diff --git a/test/transpiler/transpiler.test.js b/test/transpiler/transpiler.test.js
index c80a0670a..a6c2dcf73 100644
--- a/test/transpiler/transpiler.test.js
+++ b/test/transpiler/transpiler.test.js
@@ -1973,11 +1973,6 @@ console.log(resolve.length)
       expectParseError("/x/msuygig", 'Duplicate flag "g" in regular expression');
     });
 
-    it("non-ascii regexp literals", () => {
-      var str = "🔴11 54 / 10,000";
-      expect(str.replace(/[🔵🔴,]+/g, "")).toBe("11 54 / 10000");
-    });
-
     it("identifier escapes", () => {
       expectPrinted_("var _\u0076\u0061\u0072", "var _var");
       expectParseError("var \u0076\u0061\u0072", 'Expected identifier but found "\u0076\u0061\u0072"');
author	Jarred Sumner <jarred@jarredsumner.com>	2023-09-15 04:37:13 -0700
committer	GitHub <noreply@github.com>	2023-09-15 04:37:13 -0700
commit	8c3be19d66002f12e0d4b2b201d6745ea3a9d69b (patch)
tree	ec1a9451d49f078bc0c4faf4b2cc0c7529d1bbcd
parent	92f2d9ab27fd2ab61b8bcfe8c0b42c7d6b90cdcf (diff)
download	bun-revert-5167-dylan/decode-regex-if-needed.tar.gz bun-revert-5167-dylan/decode-regex-if-needed.tar.zst bun-revert-5167-dylan/decode-regex-if-needed.zip