aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/js_ast.zig7
-rw-r--r--src/js_lexer.zig61
-rw-r--r--src/js_parser.zig32
-rw-r--r--src/js_printer.zig188
-rw-r--r--test/transpiler/transpiler.test.js5
5 files changed, 126 insertions, 167 deletions
diff --git a/src/js_ast.zig b/src/js_ast.zig
index d4a3b1d92..7811541f4 100644
--- a/src/js_ast.zig
+++ b/src/js_ast.zig
@@ -2538,10 +2538,7 @@ pub const E = struct {
};
pub const RegExp = struct {
- data: union(enum) {
- raw: string,
- decoded: bun.BabyList(u16),
- },
+ value: string,
// This exists for JavaScript bindings
// The RegExp constructor expects flags as a second argument.
@@ -2551,7 +2548,7 @@ pub const E = struct {
// ^
flags_offset: ?u16 = null,
- pub var empty = RegExp{ .data = .{ .raw = "" } };
+ pub var empty = RegExp{ .value = "" };
pub fn pattern(this: RegExp) string {
diff --git a/src/js_lexer.zig b/src/js_lexer.zig
index 47b6fdcb9..24484a02b 100644
--- a/src/js_lexer.zig
+++ b/src/js_lexer.zig
@@ -295,7 +295,7 @@ fn NewLexer_(
this.comments_to_preserve_before.clearAndFree();
}
- pub fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
+ fn decodeEscapeSequences(lexer: *LexerType, start: usize, text: string, comptime BufType: type, buf_: *BufType) !void {
var buf = buf_.*;
defer buf_.* = buf;
if (comptime is_json) lexer.is_ascii_only = false;
@@ -2075,11 +2075,9 @@ fn NewLexer_(
if (comptime is_json) unreachable;
}
- // returns true of the regex contents need to be decoded
- pub fn scanRegExp(lexer: *LexerType) !bool {
+ pub fn scanRegExp(lexer: *LexerType) !void {
lexer.assertNotJSON();
lexer.regex_flags_start = null;
- var decode = lexer.code_point >= 0x80;
while (true) {
switch (lexer.code_point) {
'/' => {
@@ -2123,48 +2121,20 @@ fn NewLexer_(
},
}
}
-
- return decode;
+ return;
},
'[' => {
lexer.step();
- if (lexer.code_point >= 0x80) decode = true;
while (lexer.code_point != ']') {
- try lexer.scanRegExpValidateAndStep(&decode);
+ try lexer.scanRegExpValidateAndStep();
}
lexer.step();
- if (lexer.code_point >= 0x80) decode = true;
},
else => {
- try lexer.scanRegExpValidateAndStep(&decode);
+ try lexer.scanRegExpValidateAndStep();
},
}
}
-
- return decode;
- }
-
- fn scanRegExpValidateAndStep(lexer: *LexerType, decode: *bool) !void {
- lexer.assertNotJSON();
-
- if (lexer.code_point == '\\') {
- lexer.step();
- if (lexer.code_point >= 0x80) decode.* = true;
- }
-
- switch (lexer.code_point) {
- '\r', '\n', 0x2028, 0x2029 => {
- // Newlines aren't allowed in regular expressions
- try lexer.syntaxError();
- },
- -1 => { // EOF
- try lexer.syntaxError();
- },
- else => {
- lexer.step();
- if (lexer.code_point >= 0x80) decode.* = true;
- },
- }
}
// TODO: use wtf-8 encoding.
@@ -2622,6 +2592,27 @@ fn NewLexer_(
try lexer.nextInsideJSXElement();
}
+ fn scanRegExpValidateAndStep(lexer: *LexerType) !void {
+ lexer.assertNotJSON();
+
+ if (lexer.code_point == '\\') {
+ lexer.step();
+ }
+
+ switch (lexer.code_point) {
+ '\r', '\n', 0x2028, 0x2029 => {
+ // Newlines aren't allowed in regular expressions
+ try lexer.syntaxError();
+ },
+ -1 => { // EOF
+ try lexer.syntaxError();
+ },
+ else => {
+ lexer.step();
+ },
+ }
+ }
+
pub fn rescanCloseBraceAsTemplateToken(lexer: *LexerType) !void {
lexer.assertNotJSON();
diff --git a/src/js_parser.zig b/src/js_parser.zig
index 2e6b9e336..a61657128 100644
--- a/src/js_parser.zig
+++ b/src/js_parser.zig
@@ -13095,39 +13095,13 @@ fn NewParser_(
return p.newExpr(E.BigInt{ .value = value }, loc);
},
.t_slash, .t_slash_equals => {
- const needs_decode = try p.lexer.scanRegExp();
+ try p.lexer.scanRegExp();
// always set regex_flags_start to null to make sure we don't accidentally use the wrong value later
defer p.lexer.regex_flags_start = null;
-
- const raw = p.lexer.raw();
-
- if (!needs_decode) {
- try p.lexer.next();
- return p.newExpr(
- E.RegExp{
- .data = .{
- .raw = raw,
- },
- .flags_offset = p.lexer.regex_flags_start,
- },
- loc,
- );
- }
-
- var buf = std.ArrayList(u16).initCapacity(p.allocator, raw.len) catch unreachable;
- try p.lexer.decodeEscapeSequences(p.lexer.start, raw, @TypeOf(buf), &buf);
-
+ const value = p.lexer.raw();
try p.lexer.next();
- return p.newExpr(
- E.RegExp{
- .data = .{
- .decoded = bun.BabyList(u16).init(buf.items),
- },
- .flags_offset = p.lexer.regex_flags_start,
- },
- loc,
- );
+ return p.newExpr(E.RegExp{ .value = value, .flags_offset = p.lexer.regex_flags_start }, loc);
},
.t_void => {
try p.lexer.next();
diff --git a/src/js_printer.zig b/src/js_printer.zig
index 3caa5f11e..57ef580b6 100644
--- a/src/js_printer.zig
+++ b/src/js_printer.zig
@@ -1437,7 +1437,7 @@ fn NewPrinter(
) catch unreachable;
}
- fn printUTF16(e: *Printer, text: []const u16, comptime quoted: bool, quote: u8) void {
+ pub fn printQuotedUTF16(e: *Printer, text: []const u16, quote: u8) void {
var i: usize = 0;
const n: usize = text.len;
@@ -1471,42 +1471,26 @@ fn NewPrinter(
e.print("\\x07");
},
0x08 => {
- if (comptime quoted) {
- if (quote == '`')
- e.print(0x08)
- else
- e.print("\\b");
- } else {
+ if (quote == '`')
+ e.print(0x08)
+ else
e.print("\\b");
- }
},
0x0C => {
- if (comptime quoted) {
- if (quote == '`')
- e.print(0x000C)
- else
- e.print("\\f");
- } else {
+ if (quote == '`')
+ e.print(0x000C)
+ else
e.print("\\f");
- }
},
'\t' => {
- if (comptime quoted) {
- if (quote == '`')
- e.print("\t")
- else
- e.print("\\t");
- } else {
+ if (quote == '`')
+ e.print("\t")
+ else
e.print("\\t");
- }
},
'\n' => {
- if (comptime quoted) {
- if (quote == '`') {
- e.print('\n');
- } else {
- e.print("\\n");
- }
+ if (quote == '`') {
+ e.print('\n');
} else {
e.print("\\n");
}
@@ -1517,12 +1501,8 @@ fn NewPrinter(
},
// \v
std.ascii.control_code.vt => {
- if (comptime quoted) {
- if (quote == '`') {
- e.print(std.ascii.control_code.vt);
- } else {
- e.print("\\v");
- }
+ if (quote == '`') {
+ e.print(std.ascii.control_code.vt);
} else {
e.print("\\v");
}
@@ -1533,37 +1513,29 @@ fn NewPrinter(
},
'\'' => {
- if (comptime quoted) {
- if (quote == '\'') {
- e.print('\\');
- }
+ if (quote == '\'') {
+ e.print('\\');
}
e.print("'");
},
'"' => {
- if (comptime quoted) {
- if (quote == '"') {
- e.print('\\');
- }
+ if (quote == '"') {
+ e.print('\\');
}
e.print("\"");
},
'`' => {
- if (comptime quoted) {
- if (quote == '`') {
- e.print('\\');
- }
+ if (quote == '`') {
+ e.print('\\');
}
e.print("`");
},
'$' => {
- if (comptime quoted) {
- if (quote == '`' and i < n and text[i] == '{') {
- e.print('\\');
- }
+ if (quote == '`' and i < n and text[i] == '{') {
+ e.print('\\');
}
e.print('$');
@@ -1587,34 +1559,32 @@ fn NewPrinter(
// this only applies to template literal strings
// but we print a template literal if there is a \n or a \r
// which is often if the string is long and UTF-16
- if (comptime quoted) {
- if (quote == '`') {
- const remain = text[i..];
- if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
- remain[0] != '$' and
- remain[0] != '\\' and
- remain[0] != '`')
- {
- if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
- if (count_ == 0)
- unreachable; // conditional above checks this
-
- const len = count_ - 1;
- i += len;
- var ptr = e.writer.reserve(len) catch unreachable;
- var to_copy = ptr[0..len];
-
- strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
- e.writer.advance(len);
- continue :outer;
- } else {
- const count = @as(u32, @truncate(remain.len));
- var ptr = e.writer.reserve(count) catch unreachable;
- var to_copy = ptr[0..count];
- strings.copyU16IntoU8(to_copy, []const u16, remain);
- e.writer.advance(count);
- i += count;
- }
+ if (quote == '`') {
+ const remain = text[i..];
+ if (remain.len > 1 and remain[0] < last_ascii and remain[0] > first_ascii and
+ remain[0] != '$' and
+ remain[0] != '\\' and
+ remain[0] != '`')
+ {
+ if (strings.@"nextUTF16NonASCIIOr$`\\"([]const u16, remain)) |count_| {
+ if (count_ == 0)
+ unreachable; // conditional above checks this
+
+ const len = count_ - 1;
+ i += len;
+ var ptr = e.writer.reserve(len) catch unreachable;
+ var to_copy = ptr[0..len];
+
+ strings.copyU16IntoU8(to_copy, []const u16, remain[0..len]);
+ e.writer.advance(len);
+ continue :outer;
+ } else {
+ const count = @as(u32, @truncate(remain.len));
+ var ptr = e.writer.reserve(count) catch unreachable;
+ var to_copy = ptr[0..count];
+ strings.copyU16IntoU8(to_copy, []const u16, remain);
+ e.writer.advance(count);
+ i += count;
}
}
}
@@ -1694,14 +1664,6 @@ fn NewPrinter(
}
}
- pub fn printUnquotedUTF16(p: *Printer, text: []const u16) void {
- p.printUTF16(text, false, 0);
- }
-
- pub fn printQuotedUTF16(p: *Printer, text: []const u16, quote: u8) void {
- p.printUTF16(text, true, quote);
- }
-
pub fn isUnboundEvalIdentifier(p: *Printer, value: Expr) bool {
switch (value.data) {
.e_identifier => |ident| {
@@ -3187,13 +3149,53 @@ fn NewPrinter(
p.print(" ");
}
- switch (e.data) {
- .raw => |raw| {
- p.print(raw);
- },
- .decoded => |decoded| {
- p.printUnquotedUTF16(decoded.slice());
- },
+ if (comptime is_bun_platform) {
+ // Translate any non-ASCII to unicode escape sequences
+ var ascii_start: usize = 0;
+ var is_ascii = false;
+ var iter = CodepointIterator.init(e.value);
+ var cursor = CodepointIterator.Cursor{};
+ while (iter.next(&cursor)) {
+ switch (cursor.c) {
+ first_ascii...last_ascii => {
+ if (!is_ascii) {
+ ascii_start = cursor.i;
+ is_ascii = true;
+ }
+ },
+ else => {
+ if (is_ascii) {
+ p.print(e.value[ascii_start..cursor.i]);
+ is_ascii = false;
+ }
+
+ switch (cursor.c) {
+ 0...0xFFFF => {
+ p.print([_]u8{
+ '\\',
+ 'u',
+ hex_chars[cursor.c >> 12],
+ hex_chars[(cursor.c >> 8) & 15],
+ hex_chars[(cursor.c >> 4) & 15],
+ hex_chars[cursor.c & 15],
+ });
+ },
+ else => {
+ p.print("\\u{");
+ std.fmt.formatInt(cursor.c, 16, .lower, .{}, p) catch unreachable;
+ p.print("}");
+ },
+ }
+ },
+ }
+ }
+
+ if (is_ascii) {
+ p.print(e.value[ascii_start..]);
+ }
+ } else {
+ // UTF8 sequence is fine
+ p.print(e.value);
}
// Need a space before the next identifier to avoid it turning into flags
diff --git a/test/transpiler/transpiler.test.js b/test/transpiler/transpiler.test.js
index c80a0670a..a6c2dcf73 100644
--- a/test/transpiler/transpiler.test.js
+++ b/test/transpiler/transpiler.test.js
@@ -1973,11 +1973,6 @@ console.log(resolve.length)
expectParseError("/x/msuygig", 'Duplicate flag "g" in regular expression');
});
- it("non-ascii regexp literals", () => {
- var str = "🔴11 54 / 10,000";
- expect(str.replace(/[🔵🔴,]+/g, "")).toBe("11 54 / 10000");
- });
-
it("identifier escapes", () => {
expectPrinted_("var _\u0076\u0061\u0072", "var _var");
expectParseError("var \u0076\u0061\u0072", 'Expected identifier but found "\u0076\u0061\u0072"');