diff options
author | 2023-09-13 01:26:18 -0700 | |
---|---|---|
committer | 2023-09-13 01:26:18 -0700 | |
commit | 32664df254be225dd195fcaf46994f0c550f9d22 (patch) | |
tree | 3636f2c68bf3e8926eae3c83ee5930968023a02e | |
parent | 15f7bacb8bd57615475e1614f5f93b23810e63b1 (diff) | |
download | bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.gz bun-32664df254be225dd195fcaf46994f0c550f9d22.tar.zst bun-32664df254be225dd195fcaf46994f0c550f9d22.zip |
decode regex if needed (#5167)
* decode regex if non-ascii
* make it comptime
* add test
* use `bun.BabyList(u16)`
-rw-r--r-- | src/js_ast.zig | 7 | ||||
-rw-r--r-- | src/js_lexer.zig | 61 | ||||
-rw-r--r-- | src/js_parser.zig | 32 | ||||
-rw-r--r-- | src/js_printer.zig | 188 | ||||
-rw-r--r-- | test/transpiler/transpiler.test.js | 5 |
5 files changed, 167 insertions, 126 deletions
diff --git a/src/js_ast.zig b/src/js_ast.zig index 7811541f4..d4a3b1d92 100644 --- a/src/js_ast.zig +++ b/src/js_ast.zig @@ -2538,7 +2538,10 @@ pub const E = struct { }; pub const RegExp = struct { - value: string, + data: union(enum) { + raw: string, + decoded: bun.BabyList(u16), + }, // This exists for JavaScript bindings // The RegExp constructor expects flags as a second argument. @@ -2548,7 +2551,7 @@ pub const E = struct { // ^ flags_offset: ?u16 = null, - pub var empty = RegExp{ .value = "" }; + pub var empty = RegExp{ .data = .{ .raw = "" } }; pub fn pattern(this: RegExp) string { diff --git a/src/js_lexer.zig b/src/js_lexer.zig index 1417ea01e..01852bb65 100644 --- a/src/js_lexer.zig +++ b/src/js_lexer.zig @@ -295,7 +295,7 @@ fn NewLexer_( this.comments_to_preserve_before.clearAndFree(); } - fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { + pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void { var buf = buf_.*; defer buf_.* = buf; if (comptime is_json) lexer.is_ascii_only = false; @@ -2075,9 +2075,11 @@ fn NewLexer_( if (comptime is_json) unreachable; } - pub fn scanRegExp(lexer: *LexerType) !void { + // returns true if the regex contents need to be decoded + pub fn scanRegExp(lexer: *LexerType) !bool { lexer.assertNotJSON(); lexer.regex_flags_start = null; + var decode = lexer.code_point >= 0x80; while (true) { switch (lexer.code_point) { '/' => { @@ -2121,20 +2123,48 @@ fn NewLexer_( }, } } - return; + + return decode; }, '[' => { lexer.step(); + if (lexer.code_point >= 0x80) decode = true; while (lexer.code_point != ']') { - try lexer.scanRegExpValidateAndStep(); + try lexer.scanRegExpValidateAndStep(&decode); } lexer.step(); + if (lexer.code_point >= 0x80) decode = true; }, else => { - try lexer.scanRegExpValidateAndStep(); + try lexer.scanRegExpValidateAndStep(&decode); }, } } + + return decode; + } + + fn 
scanRegExpValidateAndStep(lexer: *LexerType, decode: *bool) !void { + lexer.assertNotJSON(); + + if (lexer.code_point == '\\') { + lexer.step(); + if (lexer.code_point >= 0x80) decode.* = true; + } + + switch (lexer.code_point) { + '\r', '\n', 0x2028, 0x2029 => { + // Newlines aren't allowed in regular expressions + try lexer.syntaxError(); + }, + -1 => { // EOF + try lexer.syntaxError(); + }, + else => { + lexer.step(); + if (lexer.code_point >= 0x80) decode.* = true; + }, + } } // TODO: use wtf-8 encoding. @@ -2592,27 +2622,6 @@ fn NewLexer_( try lexer.nextInsideJSXElement(); } - fn scanRegExpValidateAndStep(lexer: *LexerType) !void { - lexer.assertNotJSON(); - - if (lexer.code_point == '\\') { - lexer.step(); - } - - switch (lexer.code_point) { - '\r', '\n', 0x2028, 0x2029 => { - // Newlines aren't allowed in regular expressions - try lexer.syntaxError(); - }, - -1 => { // EOF - try lexer.syntaxError(); - }, - else => { - lexer.step(); - }, - } - } - pub fn rescanCloseBraceAsTemplateToken(lexer: *LexerType) !void { lexer.assertNotJSON(); diff --git a/src/js_parser.zig b/src/js_parser.zig index a61657128..2e6b9e336 100644 --- a/src/js_parser.zig +++ b/src/js_parser.zig @@ -13095,13 +13095,39 @@ fn NewParser_( return p.newExpr(E.BigInt{ .value = value }, loc); }, .t_slash, .t_slash_equals => { - try p.lexer.scanRegExp(); + const needs_decode = try p.lexer.scanRegExp(); // always set regex_flags_start to null to make sure we don't accidentally use the wrong value later defer p.lexer.regex_flags_start = null; - const value = p.lexer.raw(); + + const raw = p.lexer.raw(); + + if (!needs_decode) { + try p.lexer.next(); + return p.newExpr( + E.RegExp{ + .data = .{ + .raw = raw, + }, + .flags_offset = p.lexer.regex_flags_start, + }, + loc, + ); + } + + var buf = std.ArrayList(u16).initCapacity(p.allocator, raw.len) catch unreachable; + try p.lexer.decodeEscapeSequences(p.lexer.start, raw, @TypeOf(buf), &buf); + try p.lexer.next(); - return p.newExpr(E.RegExp{ .value = 
value, .flags_offset = p.lexer.regex_flags_start }, loc); + return p.newExpr( + E.RegExp{ + .data = .{ + .decoded = bun.BabyList(u16).init(buf.items), + }, + .flags_offset = p.lexer.regex_flags_start, + }, + loc, + ); }, .t_void => { try p.lexer.next(); diff --git a/src/js_printer.zig b/src/js_printer.zig index 57ef580b6..3caa5f11e 100644 --- a/src/js_printer.zig +++ b/src/js_printer.zig @@ -1437,7 +1437,7 @@ fn NewPrinter( ) catch unreachable; } - pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void { + fn printUTF16(e: *Printer, text: []const u16, comptime quoted: bool, quote: u8) void { var i: usize = 0; const n: usize = text.len; @@ -1471,26 +1471,42 @@ fn NewPrinter( e.print("\\x07"); }, 0x08 => { - if (quote == '`') - e.print(0x08) - else + if (comptime quoted) { + if (quote == '`') + e.print(0x08) + else + e.print("\\b"); + } else { e.print("\\b"); + } }, 0x0C => { - if (quote == '`') - e.print(0x000C) - else + if (comptime quoted) { + if (quote == '`') + e.print(0x000C) + else + e.print("\\f"); + } else { e.print("\\f"); + } }, '\t' => { - if (quote == '`') - e.print("\t") - else + if (comptime quoted) { + if (quote == '`') + e.print("\t") + else + e.print("\\t"); + } else { e.print("\\t"); + } }, '\n' => { - if (quote == '`') { - e.print('\n'); + if (comptime quoted) { + if (quote == '`') { + e.print('\n'); + } else { + e.print("\\n"); + } } else { e.print("\\n"); } @@ -1501,8 +1517,12 @@ fn NewPrinter( }, // \v std.ascii.control_code.vt => { - if (quote == '`') { - e.print(std.ascii.control_code.vt); + if (comptime quoted) { + if (quote == '`') { + e.print(std.ascii.control_code.vt); + } else { + e.print("\\v"); + } } else { e.print("\\v"); } @@ -1513,29 +1533,37 @@ fn NewPrinter( }, '\'' => { - if (quote == '\'') { - e.print('\\'); + if (comptime quoted) { + if (quote == '\'') { + e.print('\\'); + } } e.print("'"); }, '"' => { - if (quote == '"') { - e.print('\\'); + if (comptime quoted) { + if (quote == '"') { + e.print('\\'); + } } 
e.print("\""); }, '`' => { - if (quote == '`') { - e.print('\\'); + if (comptime quoted) { + if (quote == '`') { + e.print('\\'); + } } e.print("`"); }, '$' => { - if (quote == '`' and i < n and text[i] == '{') { - e.print('\\'); + if (comptime quoted) { + if (quote == '`' and i < n and text[i] == '{') { + e.print('\\'); + } } e.print('$'); @@ -1559,32 +1587,34 @@ fn NewPrinter( // this only applies to template literal strings // but we print a template literal if there is a \n or a \r // which is often if the string is long and UTF-16 - if (quote == '`') { - const remain = text[i..]; - if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and - remain[0] != '$' and - remain[0] != '\\' and - remain[0] != '`') - { - if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| { - if (count_ == 0) - unreachable; // conditional above checks this - - const len = count_ - 1; - i += len; - var ptr = e.writer.reserve(len) catch unreachable; - var to_copy = ptr[0..len]; - - strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]); - e.writer.advance(len); - continue :outer; - } else { - const count = @as(u32, @truncate(remain.len)); - var ptr = e.writer.reserve(count) catch unreachable; - var to_copy = ptr[0..count]; - strings.copyU16IntoU8(to_copy, []const u16, remain); - e.writer.advance(count); - i += count; + if (comptime quoted) { + if (quote == '`') { + const remain = text[i..]; + if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and + remain[0] != '$' and + remain[0] != '\\' and + remain[0] != '`') + { + if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| { + if (count_ == 0) + unreachable; // conditional above checks this + + const len = count_ - 1; + i += len; + var ptr = e.writer.reserve(len) catch unreachable; + var to_copy = ptr[0..len]; + + strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]); + e.writer.advance(len); + continue :outer; + } else { + const count = @as(u32, 
@truncate(remain.len)); + var ptr = e.writer.reserve(count) catch unreachable; + var to_copy = ptr[0..count]; + strings.copyU16IntoU8(to_copy, []const u16, remain); + e.writer.advance(count); + i += count; + } } } } @@ -1664,6 +1694,14 @@ fn NewPrinter( } } + pub fn printUnquotedUTF16(p: *Printer, text: []const u16) void { + p.printUTF16(text, false, 0); + } + + pub fn printQuotedUTF16(p: *Printer, text: []const u16, quote: u8) void { + p.printUTF16(text, true, quote); + } + pub fn isUnboundEvalIdentifier(p: *Printer, value: Expr) bool { switch (value.data) { .e_identifier => |ident| { @@ -3149,53 +3187,13 @@ fn NewPrinter( p.print(" "); } - if (comptime is_bun_platform) { - // Translate any non-ASCII to unicode escape sequences - var ascii_start: usize = 0; - var is_ascii = false; - var iter = CodepointIterator.init(e.value); - var cursor = CodepointIterator.Cursor{}; - while (iter.next(&cursor)) { - switch (cursor.c) { - first_ascii...last_ascii => { - if (!is_ascii) { - ascii_start = cursor.i; - is_ascii = true; - } - }, - else => { - if (is_ascii) { - p.print(e.value[ascii_start..cursor.i]); - is_ascii = false; - } - - switch (cursor.c) { - 0...0xFFFF => { - p.print([_]u8{ - '\\', - 'u', - hex_chars[cursor.c >> 12], - hex_chars[(cursor.c >> 8) & 15], - hex_chars[(cursor.c >> 4) & 15], - hex_chars[cursor.c & 15], - }); - }, - else => { - p.print("\\u{"); - std.fmt.formatInt(cursor.c, 16, .lower, .{}, p) catch unreachable; - p.print("}"); - }, - } - }, - } - } - - if (is_ascii) { - p.print(e.value[ascii_start..]); - } - } else { - // UTF8 sequence is fine - p.print(e.value); + switch (e.data) { + .raw => |raw| { + p.print(raw); + }, + .decoded => |decoded| { + p.printUnquotedUTF16(decoded.slice()); + }, } // Need a space before the next identifier to avoid it turning into flags diff --git a/test/transpiler/transpiler.test.js b/test/transpiler/transpiler.test.js index a6c2dcf73..c80a0670a 100644 --- a/test/transpiler/transpiler.test.js +++ 
b/test/transpiler/transpiler.test.js @@ -1973,6 +1973,11 @@ console.log(resolve.length) expectParseError("/x/msuygig", 'Duplicate flag "g" in regular expression'); }); + it("non-ascii regexp literals", () => { + var str = "🔴11 54 / 10,000"; + expect(str.replace(/[🔵🔴,]+/g, "")).toBe("11 54 / 10000"); + }); + it("identifier escapes", () => { expectPrinted_("var _\u0076\u0061\u0072", "var _var"); expectParseError("var \u0076\u0061\u0072", 'Expected identifier but found "\u0076\u0061\u0072"'); |