diff options
author | 2023-09-15 04:37:13 -0700 | |
---|---|---|
committer | 2023-09-15 04:37:13 -0700 | |
commit | 8c3be19d66002f12e0d4b2b201d6745ea3a9d69b (patch) | |
tree | ec1a9451d49f078bc0c4faf4b2cc0c7529d1bbcd | |
parent | 92f2d9ab27fd2ab61b8bcfe8c0b42c7d6b90cdcf (diff) | |
download | bun-revert-5167-dylan/decode-regex-if-needed.tar.gz bun-revert-5167-dylan/decode-regex-if-needed.tar.zst bun-revert-5167-dylan/decode-regex-if-needed.zip |
Revert "decode regex if needed (#5167)" (branch: revert-5167-dylan/decode-regex-if-needed)
This reverts commit 32664df254be225dd195fcaf46994f0c550f9d22.
-rw-r--r-- | src/js_ast.zig | 7 | ||||
-rw-r--r-- | src/js_lexer.zig | 61 | ||||
-rw-r--r-- | src/js_parser.zig | 32 | ||||
-rw-r--r-- | src/js_printer.zig | 188 | ||||
-rw-r--r-- | test/transpiler/transpiler.test.js | 5 |
5 files changed, 126 insertions, 167 deletions
diff --git a/src/js_ast.zig b/src/js_ast.zig index d4a3b1d92..7811541f4 100644 --- a/src/js_ast.zig +++ b/src/js_ast.zig @@ -2538,10 +2538,7 @@ pub const E = struct { }; pub const RegExp = struct { - data: union(enum) { - raw: string, - decoded: bun.BabyList(u16), - }, + value: string, // This exists for JavaScript bindings // The RegExp constructor expects flags as a second argument. @@ -2551,7 +2548,7 @@ pub const E = struct { // ^ flags_offset: ?u16 = null, - pub var empty = RegExp{ .data = .{ .raw = "" } }; + pub var empty = RegExp{ .value = "" }; pub fn pattern(this: RegExp) string { diff --git a/src/js_lexer.zig b/src/js_lexer.zig index 47b6fdcb9..24484a02b 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -295,7 +295,7 @@ fn NewLexer_( this.comments_to_preserve_before.clearAndFree(); } - pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { + fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { var buf = buf_.*; defer buf_.* = buf; if (comptime is_json) lexer.is_ascii_only = false; @@ -2075,11 +2075,9 @@ fn NewLexer_( if (comptime is_json) unreachable; } - // returns true of the regex contents need to be decoded - pub fn scanRegExp(lexer: *LexerType) !bool { + pub fn scanRegExp(lexer: *LexerType) !void { lexer.assertNotJSON(); lexer.regex_flags_start = null; - var decode = lexer.code_point >= 0x80; while (true) { switch (lexer.code_point) { '/' => { @@ -2123,48 +2121,20 @@ fn NewLexer_( }, } } - - return decode; + return; }, '[' => { lexer.step(); - if (lexer.code_point >= 0x80) decode = true; while (lexer.code_point != ']') { - try lexer.scanRegExpValidateAndStep(&decode); + try lexer.scanRegExpValidateAndStep(); } lexer.step(); - if (lexer.code_point >= 0x80) decode = true; }, else => { - try lexer.scanRegExpValidateAndStep(&decode); + try lexer.scanRegExpValidateAndStep(); }, } } - - return decode; - } - - fn 
scanRegExpValidateAndStep(lexer: *LexerType, decode: *bool) !void { - lexer.assertNotJSON(); - - if (lexer.code_point == '\\') { - lexer.step(); - if (lexer.code_point >= 0x80) decode.* = true; - } - - switch (lexer.code_point) { - '\r', '\n', 0x2028, 0x2029 => { - // Newlines aren't allowed in regular expressions - try lexer.syntaxError(); - }, - -1 => { // EOF - try lexer.syntaxError(); - }, - else => { - lexer.step(); - if (lexer.code_point >= 0x80) decode.* = true; - }, - } } // TODO: use wtf-8 encoding. @@ -2622,6 +2592,27 @@ fn NewLexer_( try lexer.nextInsideJSXElement(); } + fn scanRegExpValidateAndStep(lexer: *LexerType) !void { + lexer.assertNotJSON(); + + if (lexer.code_point == '\\') { + lexer.step(); + } + + switch (lexer.code_point) { + '\r', '\n', 0x2028, 0x2029 => { + // Newlines aren't allowed in regular expressions + try lexer.syntaxError(); + }, + -1 => { // EOF + try lexer.syntaxError(); + }, + else => { + lexer.step(); + }, + } + } + pub fn rescanCloseBraceAsTemplateToken(lexer: *LexerType) !void { lexer.assertNotJSON(); diff --git a/src/js_parser.zig b/src/js_parser.zig index 2e6b9e336..a61657128 100644 --- a/src/js_parser.zig +++ b/src/js_parser.zig @@ -13095,39 +13095,13 @@ fn NewParser_( return p.newExpr(E.BigInt{ .value = value }, loc); }, .t_slash, .t_slash_equals => { - const needs_decode = try p.lexer.scanRegExp(); + try p.lexer.scanRegExp(); // always set regex_flags_start to null to make sure we don't accidentally use the wrong value later defer p.lexer.regex_flags_start = null; - - const raw = p.lexer.raw(); - - if (!needs_decode) { - try p.lexer.next(); - return p.newExpr( - E.RegExp{ - .data = .{ - .raw = raw, - }, - .flags_offset = p.lexer.regex_flags_start, - }, - loc, - ); - } - - var buf = std.ArrayList(u16).initCapacity(p.allocator, raw.len) catch unreachable; - try p.lexer.decodeEscapeSequences(p.lexer.start, raw, @TypeOf(buf), &buf); - + const value = p.lexer.raw(); try p.lexer.next(); - return p.newExpr( - E.RegExp{ - .data 
= .{ - .decoded = bun.BabyList(u16).init(buf.items), - }, - .flags_offset = p.lexer.regex_flags_start, - }, - loc, - ); + return p.newExpr(E.RegExp{ .value = value, .flags_offset = p.lexer.regex_flags_start }, loc); }, .t_void => { try p.lexer.next(); diff --git a/src/js_printer.zig b/src/js_printer.zig index 3caa5f11e..57ef580b6 100644 --- a/src/js_printer.zig +++ b/src/js_printer.zig @@ -1437,7 +1437,7 @@ fn NewPrinter( ) catch unreachable; } - fn printUTF16(e: *Printer, text: []const u16, comptime quoted: bool, quote: u8) void { + pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void { var i: usize = 0; const n: usize = text.len; @@ -1471,42 +1471,26 @@ fn NewPrinter( e.print("\\x07"); }, 0x08 => { - if (comptime quoted) { - if (quote == '`') - e.print(0x08) - else - e.print("\\b"); - } else { + if (quote == '`') + e.print(0x08) + else e.print("\\b"); - } }, 0x0C => { - if (comptime quoted) { - if (quote == '`') - e.print(0x000C) - else - e.print("\\f"); - } else { + if (quote == '`') + e.print(0x000C) + else e.print("\\f"); - } }, '\t' => { - if (comptime quoted) { - if (quote == '`') - e.print("\t") - else - e.print("\\t"); - } else { + if (quote == '`') + e.print("\t") + else e.print("\\t"); - } }, '\n' => { - if (comptime quoted) { - if (quote == '`') { - e.print('\n'); - } else { - e.print("\\n"); - } + if (quote == '`') { + e.print('\n'); } else { e.print("\\n"); } @@ -1517,12 +1501,8 @@ fn NewPrinter( }, // \v std.ascii.control_code.vt => { - if (comptime quoted) { - if (quote == '`') { - e.print(std.ascii.control_code.vt); - } else { - e.print("\\v"); - } + if (quote == '`') { + e.print(std.ascii.control_code.vt); } else { e.print("\\v"); } @@ -1533,37 +1513,29 @@ fn NewPrinter( }, '\'' => { - if (comptime quoted) { - if (quote == '\'') { - e.print('\\'); - } + if (quote == '\'') { + e.print('\\'); } e.print("'"); }, '"' => { - if (comptime quoted) { - if (quote == '"') { - e.print('\\'); - } + if (quote == '"') { + e.print('\\'); } 
e.print("\""); }, '`' => { - if (comptime quoted) { - if (quote == '`') { - e.print('\\'); - } + if (quote == '`') { + e.print('\\'); } e.print("`"); }, '$' => { - if (comptime quoted) { - if (quote == '`' and i < n and text[i] == '{') { - e.print('\\'); - } + if (quote == '`' and i < n and text[i] == '{') { + e.print('\\'); } e.print('$'); @@ -1587,34 +1559,32 @@ fn NewPrinter( // this only applies to template literal strings // but we print a template literal if there is a \n or a \r // which is often if the string is long and UTF-16 - if (comptime quoted) { - if (quote == '`') { - const remain = text[i..]; - if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and - remain[0] != '$' and - remain[0] != '\\' and - remain[0] != '`') - { - if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| { - if (count_ == 0) - unreachable; // conditional above checks this - - const len = count_ - 1; - i += len; - var ptr = e.writer.reserve(len) catch unreachable; - var to_copy = ptr[0..len]; - - strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]); - e.writer.advance(len); - continue :outer; - } else { - const count = @as(u32, @truncate(remain.len)); - var ptr = e.writer.reserve(count) catch unreachable; - var to_copy = ptr[0..count]; - strings.copyU16IntoU8(to_copy, []const u16, remain); - e.writer.advance(count); - i += count; - } + if (quote == '`') { + const remain = text[i..]; + if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and + remain[0] != '$' and + remain[0] != '\\' and + remain[0] != '`') + { + if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| { + if (count_ == 0) + unreachable; // conditional above checks this + + const len = count_ - 1; + i += len; + var ptr = e.writer.reserve(len) catch unreachable; + var to_copy = ptr[0..len]; + + strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]); + e.writer.advance(len); + continue :outer; + } else { + const count = @as(u32, 
@truncate(remain.len)); + var ptr = e.writer.reserve(count) catch unreachable; + var to_copy = ptr[0..count]; + strings.copyU16IntoU8(to_copy, []const u16, remain); + e.writer.advance(count); + i += count; } } } @@ -1694,14 +1664,6 @@ fn NewPrinter( } } - pub fn printUnquotedUTF16(p: *Printer, text: []const u16) void { - p.printUTF16(text, false, 0); - } - - pub fn printQuotedUTF16(p: *Printer, text: []const u16, quote: u8) void { - p.printUTF16(text, true, quote); - } - pub fn isUnboundEvalIdentifier(p: *Printer, value: Expr) bool { switch (value.data) { .e_identifier => |ident| { @@ -3187,13 +3149,53 @@ fn NewPrinter( p.print(" "); } - switch (e.data) { - .raw => |raw| { - p.print(raw); - }, - .decoded => |decoded| { - p.printUnquotedUTF16(decoded.slice()); - }, + if (comptime is_bun_platform) { + // Translate any non-ASCII to unicode escape sequences + var ascii_start: usize = 0; + var is_ascii = false; + var iter = CodepointIterator.init(e.value); + var cursor = CodepointIterator.Cursor{}; + while (iter.next(&cursor)) { + switch (cursor.c) { + first_ascii...last_ascii => { + if (!is_ascii) { + ascii_start = cursor.i; + is_ascii = true; + } + }, + else => { + if (is_ascii) { + p.print(e.value[ascii_start..cursor.i]); + is_ascii = false; + } + + switch (cursor.c) { + 0...0xFFFF => { + p.print([_]u8{ + '\\', + 'u', + hex_chars[cursor.c >> 12], + hex_chars[(cursor.c >> 8) & 15], + hex_chars[(cursor.c >> 4) & 15], + hex_chars[cursor.c & 15], + }); + }, + else => { + p.print("\\u{"); + std.fmt.formatInt(cursor.c, 16, .lower, .{}, p) catch unreachable; + p.print("}"); + }, + } + }, + } + } + + if (is_ascii) { + p.print(e.value[ascii_start..]); + } + } else { + // UTF8 sequence is fine + p.print(e.value); } // Need a space before the next identifier to avoid it turning into flags diff --git a/test/transpiler/transpiler.test.js b/test/transpiler/transpiler.test.js index c80a0670a..a6c2dcf73 100644 --- a/test/transpiler/transpiler.test.js +++ 
b/test/transpiler/transpiler.test.js @@ -1973,11 +1973,6 @@ console.log(resolve.length) expectParseError("/x/msuygig", 'Duplicate flag "g" in regular expression'); }); - it("non-ascii regexp literals", () => { - var str = "🔴11 54 / 10,000"; - expect(str.replace(/[🔵🔴,]+/g, "")).toBe("11 54 / 10000"); - }); - it("identifier escapes", () => { expectPrinted_("var _\u0076\u0061\u0072", "var _var"); expectParseError("var \u0076\u0061\u0072", 'Expected identifier but found "\u0076\u0061\u0072"'); |