diff options
author | 2022-07-15 21:07:27 -0700 | |
---|---|---|
committer | 2022-07-15 21:07:41 -0700 | |
commit | fd4a210b84da0c7e5f55ae31a7e8af805b81abaa (patch) | |
tree | 7f1a4e30cb1aa3e69309d61d396779592e311147 | |
parent | 9a7874a680dc8846628b11b923f0096e7d3dadfe (diff) | |
download | bun-fd4a210b84da0c7e5f55ae31a7e8af805b81abaa.tar.gz bun-fd4a210b84da0c7e5f55ae31a7e8af805b81abaa.tar.zst bun-fd4a210b84da0c7e5f55ae31a7e8af805b81abaa.zip |
[bun.js] Fix non-ascii latin1 string handling in console.log
Closes https://github.com/oven-sh/bun/issues/738
Closes https://github.com/oven-sh/bun/issues/737
-rw-r--r-- | src/bun.js/bindings/bindings.zig | 15 | ||||
-rw-r--r-- | src/bun.js/bindings/exports.zig | 86 | ||||
-rw-r--r-- | src/bun.js/javascript_core_c_api.zig | 4 | ||||
-rw-r--r-- | src/bun.js/webcore/response.zig | 2 | ||||
-rw-r--r-- | src/js_printer.zig | 61 | ||||
-rw-r--r-- | src/string_immutable.zig | 63 | ||||
-rw-r--r-- | test/bun.js/inspect.test.js | 17 |
7 files changed, 175 insertions, 73 deletions
diff --git a/src/bun.js/bindings/bindings.zig b/src/bun.js/bindings/bindings.zig index 7fd20081e..897b3c7b4 100644 --- a/src/bun.js/bindings/bindings.zig +++ b/src/bun.js/bindings/bindings.zig @@ -206,6 +206,12 @@ pub const ZigString = extern struct { return ZigString{ .ptr = slice_.ptr, .len = slice_.len }; } + pub fn init16(slice_: []const u16) ZigString { + var out = ZigString{ .ptr = std.mem.sliceAsBytes(slice_).ptr, .len = slice_.len }; + out.markUTF16(); + return out; + } + pub fn from(slice_: JSC.C.JSValueRef, ctx: JSC.C.JSContextRef) ZigString { return JSC.JSValue.fromRef(slice_).getZigString(ctx.ptr()); } @@ -242,7 +248,7 @@ pub const ZigString = extern struct { return shim.cppFn("toExternalU16", .{ ptr, len, global }); } - pub fn isUTF8(this: *ZigString) bool { + pub fn isUTF8(this: ZigString) bool { return (@ptrToInt(this.ptr) & (1 << 61)) != 0; } @@ -272,12 +278,17 @@ pub const ZigString = extern struct { } pub fn format(self: ZigString, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + if (self.isUTF8()) { + try writer.writeAll(self.slice()); + return; + } + if (self.is16Bit()) { try strings.formatUTF16(self.utf16Slice(), writer); return; } - try writer.writeAll(self.slice()); + try strings.formatLatin1(self.slice(), writer); } pub inline fn toRef(slice_: []const u8, global: *JSGlobalObject) C_API.JSValueRef { diff --git a/src/bun.js/bindings/exports.zig b/src/bun.js/bindings/exports.zig index e39d3ea12..e68204f9a 100644 --- a/src/bun.js/bindings/exports.zig +++ b/src/bun.js/bindings/exports.zig @@ -1393,6 +1393,7 @@ pub const ZigConsoleClient = struct { var slice = slice_; var i: u32 = 0; var len: u32 = @truncate(u32, slice.len); + var any_non_ascii = false; while (i < len) : (i += 1) { switch (slice[i]) { '%' => { @@ -1411,7 +1412,11 @@ pub const ZigConsoleClient = struct { // Flush everything up to the % const end = slice[0 .. i - 1]; - writer.writeAll(end); + if (!any_non_ascii) + writer.writeAll(end) + else + writer.writeLatin1(end); + any_non_ascii = false; slice = slice[@minimum(slice.len, i + 1)..]; i = 0; len = @truncate(u32, slice.len); @@ -1436,11 +1441,14 @@ pub const ZigConsoleClient = struct { break; if (slice[i] == '%') i += 2; }, + 128...255 => { + any_non_ascii = true; + }, else => {}, } } - if (slice.len > 0) writer.writeAll(slice); + if (slice.len > 0) writer.writeLatin1(slice); } pub fn WrappedWriter(comptime Writer: type) type { @@ -1451,9 +1459,32 @@ pub const ZigConsoleClient = struct { self.ctx.print(fmt, args) catch unreachable; } + pub fn writeLatin1(self: *@This(), buf: []const u8) void { + var remain = buf; + while (remain.len > 0) { + if (strings.firstNonASCII(remain)) |i| { + if (i > 0) self.ctx.writeAll(remain[0..i]) catch unreachable; + self.ctx.writeAll(&strings.latin1ToCodepointBytesAssumeNotASCII(remain[i])) catch unreachable; + remain = remain[i + 1 ..]; + } else { + break; + } + } + + self.ctx.writeAll(remain) catch unreachable; + } + pub inline fn writeAll(self: *@This(), buf: []const u8) void { self.ctx.writeAll(buf) catch unreachable; } + + pub inline fn writeString(self: *@This(), str: ZigString) void { + self.print("{}", .{str}); + } + + pub inline fn write16Bit(self: *@This(), input: []const u16) void { + strings.formatUTF16Type([]const u16, input, self.ctx) catch unreachable; + } }; } @@ -1594,18 +1625,18 @@ pub const ZigConsoleClient = struct { return; } - JSPrinter.writeJSONString(str.slice(), Writer, writer_, false) catch unreachable; + JSPrinter.writeJSONString(str.slice(), Writer, writer_, .latin1) catch unreachable; return; } - if (jsType == .RegExpObject) { + if (jsType == .RegExpObject and enable_ansi_colors) { writer.print(comptime Output.prettyFmt("<r><red>", enable_ansi_colors), .{}); } writer.print("{}", .{str}); - if (jsType == .RegExpObject) { + if (jsType == .RegExpObject and enable_ansi_colors) { writer.print(comptime Output.prettyFmt("<r>", enable_ansi_colors), .{}); } }, @@ -1629,11 +1660,7 @@ pub const ZigConsoleClient = struct { var description = value.getDescription(this.globalThis); if (description.len > 0) { - var slice = description.toSlice(default_allocator); - defer if (slice.allocated) slice.deinit(); - writer.print(comptime Output.prettyFmt("<r><cyan>Symbol<r><d>(<green>{}<r><d>)<r>", enable_ansi_colors), .{ - JSPrinter.formatJSONString(slice.slice()), - }); + writer.print(comptime Output.prettyFmt("<r><cyan>Symbol<r><d>(<green>{}<r><d>)<r>", enable_ansi_colors), .{description}); } else { writer.print(comptime Output.prettyFmt("<r><cyan>Symbol<r>", enable_ansi_colors), .{}); } @@ -1976,20 +2003,19 @@ pub const ZigConsoleClient = struct { print_children: { switch (tag.tag) { .String => { - var children_slice = children.toSlice(this.globalThis, default_allocator); - defer if (children_slice.allocated) children_slice.deinit(); - if (children_slice.len == 0) break :print_children; + var children_string = children.getZigString(this.globalThis); + if (children_string.len == 0) break :print_children; if (comptime enable_ansi_colors) writer.writeAll(comptime Output.prettyFmt("<r>", true)); writer.writeAll(">"); - if (children_slice.len < 128) { - writer.writeAll(children_slice.slice()); + if (children_string.len < 128) { + writer.writeString(children_string); } else { this.indent += 1; writer.writeAll("\n"); this.writeIndent(Writer, writer_) catch unreachable; this.indent -|= 1; - writer.writeAll(children_slice.slice()); + writer.writeString(children_string); writer.writeAll("\n"); this.writeIndent(Writer, writer_) catch unreachable; } @@ -2093,25 +2119,43 @@ pub const ZigConsoleClient = struct { defer CAPI.JSStringRelease(property_name_ref); const len = CAPI.JSStringGetLength(property_name_ref); if (len == 0) continue; - var prop = CAPI.JSStringGetCharacters8Ptr(property_name_ref)[0..len]; + const encoding = CAPI.JSStringEncoding(property_name_ref); var property_value = CAPI.JSObjectGetProperty(this.globalThis.ref(), object, property_name_ref, null); const tag = Tag.get(JSValue.fromRef(property_value), this.globalThis); if (tag.cell.isHidden()) continue; - const key = prop[0..@minimum(prop.len, 128)]; + const key: ZigString = if (encoding == .char8) + ZigString.init((JSC.C.JSStringGetCharacters8Ptr(property_name_ref))[0..@minimum(len, 128)]) + else + ZigString.init16(JSC.C.JSStringGetCharactersPtr(property_name_ref)[0..@minimum(len, 128)]); // TODO: make this one pass? - if (JSLexer.isLatin1Identifier(@TypeOf(key), key)) { + if (!key.is16Bit() and JSLexer.isLatin1Identifier(@TypeOf(key.slice()), key.slice())) { writer.print( - comptime Output.prettyFmt("{s}<d>:<r> ", enable_ansi_colors), + comptime Output.prettyFmt("{}<d>:<r> ", enable_ansi_colors), .{key}, ); + } else if (key.is16Bit()) { + var utf16Slice = key.utf16SliceAligned(); + writer.writeAll("\""); + + while (strings.indexOfAny16(utf16Slice, "\"")) |j| { + writer.write16Bit(utf16Slice[0..j]); + writer.writeAll("\\\""); + utf16Slice = utf16Slice[j + 1 ..]; + } + + writer.write16Bit(utf16Slice); + writer.print( + comptime Output.prettyFmt("\"<d>:<r> ", enable_ansi_colors), + .{}, + ); } else { writer.print( comptime Output.prettyFmt("{s}<d>:<r> ", enable_ansi_colors), - .{JSPrinter.formatJSONString(key)}, + .{JSPrinter.formatJSONString(key.slice())}, ); } diff --git a/src/bun.js/javascript_core_c_api.zig b/src/bun.js/javascript_core_c_api.zig index 82145defc..00f35c45a 100644 --- a/src/bun.js/javascript_core_c_api.zig +++ b/src/bun.js/javascript_core_c_api.zig @@ -244,13 +244,13 @@ pub extern "c" fn JSContextGetGroup(ctx: JSContextRef) JSContextGroupRef; pub extern "c" fn JSContextGetGlobalContext(ctx: JSContextRef) JSGlobalContextRef; pub extern "c" fn JSGlobalContextCopyName(ctx: JSGlobalContextRef) JSStringRef; pub extern "c" fn JSGlobalContextSetName(ctx: JSGlobalContextRef, name: JSStringRef) void; -pub const JSChar = c_ushort; +pub const JSChar = u16; pub extern fn JSStringCreateWithCharacters(chars: [*c]const JSChar, numChars: usize) JSStringRef; pub extern fn JSStringCreateWithUTF8CString(string: [*c]const u8) JSStringRef; pub extern fn JSStringRetain(string: JSStringRef) JSStringRef; pub extern fn JSStringRelease(string: JSStringRef) void; pub extern fn JSStringGetLength(string: JSStringRef) usize; -pub extern fn JSStringGetCharactersPtr(string: JSStringRef) [*c]const JSChar; +pub extern fn JSStringGetCharactersPtr(string: JSStringRef) [*]const JSChar; pub extern fn JSStringGetMaximumUTF8CStringSize(string: JSStringRef) usize; pub extern fn JSStringGetUTF8CString(string: JSStringRef, buffer: [*c]u8, bufferSize: usize) usize; pub extern fn JSStringIsEqual(a: JSStringRef, b: JSStringRef) bool; diff --git a/src/bun.js/webcore/response.zig b/src/bun.js/webcore/response.zig index 6171468b1..ab567c75a 100644 --- a/src/bun.js/webcore/response.zig +++ b/src/bun.js/webcore/response.zig @@ -208,7 +208,7 @@ pub const Response = struct { try formatter.writeIndent(Writer, writer); try writer.writeAll("statusText: "); - try JSPrinter.writeJSONString(this.status_text, Writer, writer, false); + try JSPrinter.writeJSONString(this.status_text, Writer, writer, .ascii); formatter.printComma(Writer, writer, enable_ansi_colors) catch unreachable; try writer.writeAll("\n"); diff --git a/src/js_printer.zig b/src/js_printer.zig index 945e0a382..e6f31aa85 100644 --- a/src/js_printer.zig +++ b/src/js_printer.zig @@ -226,30 +226,45 @@ const JSONFormatter = struct { input: []const u8, pub fn format(self: JSONFormatter, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { - try writeJSONString(self.input, @TypeOf(writer), writer, false); + try writeJSONString(self.input, @TypeOf(writer), writer, .latin1); } }; +/// Expects latin1 pub fn formatJSONString(text: []const u8) JSONFormatter { return JSONFormatter{ .input = text }; } -pub fn writeJSONString(text: []const u8, comptime Writer: type, writer: Writer, comptime ascii_only: bool) !void { +pub fn writeJSONString(input: []const u8, comptime Writer: type, writer: Writer, comptime encoding: strings.Encoding) !void { try writer.writeAll("\""); - var i: usize = 0; - var n: usize = text.len; - while (i < n) { - const width = strings.wtf8ByteSequenceLengthWithInvalid(text[i]); - const c = strings.decodeWTF8RuneT(text.ptr[i .. i + 4][0..4], width, i32, 0); - if (canPrintWithoutEscape(i32, c, ascii_only)) { - const remain = text[i + @as(usize, width) ..]; + var text = input; + const end = text.ptr + text.len; + if (comptime encoding == .utf16) { + @compileError("not implemented yet"); + } + + while (text.ptr != end) { + const width = if (comptime encoding == .latin1 or encoding == .ascii) + 1 + else + strings.wtf8ByteSequenceLengthWithInvalid(text[0]); + + const c: i32 = if (comptime encoding == .utf8) + strings.decodeWTF8RuneT(text.ptr[0..4], width, i32, 0) + else brk: { + const char = text[0]; + if (char <= 0x7F) { + break :brk char; + } else break :brk strings.latin1ToCodepointAssumeNotASCII(char, i32); + }; + if (canPrintWithoutEscape(i32, c, false)) { + const remain = text[@as(usize, width)..]; if (strings.indexOfNeedsEscape(remain)) |j| { - try writer.writeAll(text[i .. i + j + @as(usize, width)]); - i += j + @as(usize, width); + try writer.writeAll(text[0 .. j + @as(usize, width)]); + text = text[j + @as(usize, width) ..]; continue; } else { - try writer.writeAll(text[i..]); - i = n; + try writer.writeAll(text); break; } } @@ -260,46 +275,46 @@ pub fn writeJSONString(text: []const u8, comptime Writer: type, writer: Writer, // allowed in strict mode (or in template strings). 0x07 => { try writer.writeAll("\\x07"); - i += 1; + text = text[1..]; }, 0x08 => { try writer.writeAll("\\b"); - i += 1; + text = text[1..]; }, 0x0C => { try writer.writeAll("\\f"); - i += 1; + text = text[1..]; }, '\n' => { try writer.writeAll("\\n"); - i += 1; + text = text[1..]; }, std.ascii.control_code.CR => { try writer.writeAll("\\r"); - i += 1; + text = text[1..]; }, // \v std.ascii.control_code.VT => { try writer.writeAll("\\v"); - i += 1; + text = text[1..]; }, // "\\" '\\' => { try writer.writeAll("\\\\"); - i += 1; + text = text[1..]; }, '"' => { try writer.writeAll("\\\""); - i += 1; + text = text[1..]; }, '\t' => { try writer.writeAll("\\t"); - i += 1; + text = text[1..]; }, else => { - i += @as(usize, width); + text = text[@as(usize, width)..]; if (c < 0xFFFF) { const k = @intCast(usize, c); diff --git a/src/string_immutable.zig b/src/string_immutable.zig index 479342025..ec3a2ecbe 100644 --- a/src/string_immutable.zig +++ b/src/string_immutable.zig @@ -7,6 +7,14 @@ const CodePoint = @import("string_types.zig").CodePoint; const bun = @import("global.zig"); pub const joiner = @import("./string_joiner.zig"); const assert = std.debug.assert; + +pub const Encoding = enum { + ascii, + utf8, + latin1, + utf16, +}; + pub inline fn containsChar(self: string, char: u8) bool { return indexOfChar(self, char) != null; } @@ -2189,8 +2197,7 @@ pub fn copyUTF16IntoUTF8(buf: []u8, comptime Type: type, utf16: Type) EncodeInto while (firstNonASCII16(Type, utf16_remaining)) |i| { const end = @minimum(i, remaining.len); - const to_copy = utf16_remaining[0..end]; - copyU16IntoU8(remaining, Type, to_copy); + if (end > 0) copyU16IntoU8(remaining, Type, utf16_remaining[0..end]); remaining = remaining[end..]; utf16_remaining = utf16_remaining[end..]; @@ -3133,35 +3140,43 @@ test "firstNonASCII16" { } } -pub fn formatUTF16(slice_: []align(1) const u16, writer: anytype) !void { +pub fn formatUTF16Type(comptime Slice: type, slice_: Slice, writer: anytype) !void { var slice = slice_; - var chunk: [512 + 4]u8 = undefined; - var chunk_i: u16 = 0; + const chunk_size = 2048; + var chunk: [chunk_size + 4]u8 = undefined; while (slice.len > 0) { - if (chunk_i >= chunk.len - 5) { - try writer.writeAll(chunk[0..chunk_i]); - chunk_i = 0; - } + const result = strings.copyUTF16IntoUTF8(&chunk, Slice, slice); + if (result.read == 0 or result.written == 0) + break; + try writer.writeAll(chunk[0..result.written]); + slice = slice[result.read..]; + } +} - var cp: u32 = slice[0]; - slice = slice[1..]; - if (cp & ~@as(u32, 0x03ff) == 0xd800 and slice.len > 0) { - cp = 0x10000 + (((cp & 0x03ff) << 10) | (slice[0] & 0x03ff)); - slice = slice[1..]; - } +pub fn formatUTF16(slice_: []align(1) const u16, writer: anytype) !void { + return formatUTF16Type([]align(1) const u16, slice_, writer); +} - chunk_i += @as( - u8, - @call( - .{ .modifier = .always_inline }, - encodeWTF8RuneT, - .{ chunk[chunk_i..][0..4], u32, cp }, - ), - ); +pub fn formatLatin1(slice_: []const u8, writer: anytype) !void { + var slice = slice_; + const chunk_size = 2048; + var chunk: [chunk_size + 4]u8 = undefined; + + while (strings.firstNonASCII(slice)) |i| { + if (i > 0) { + try writer.writeAll(slice[0..i]); + slice = slice[i..]; + } + const result = strings.copyLatin1IntoUTF8(&chunk, @TypeOf(slice), slice[0..@minimum(chunk.len, slice.len)]); + if (result.read == 0 or result.written == 0) + break; + try writer.writeAll(chunk[0..result.written]); + slice = slice[result.read..]; } - try writer.writeAll(chunk[0..chunk_i]); + if (slice.len > 0) + try writer.writeAll(slice); // write the remaining bytes } test "print UTF16" { diff --git a/test/bun.js/inspect.test.js b/test/bun.js/inspect.test.js index aa38f5c2d..138b395db 100644 --- a/test/bun.js/inspect.test.js +++ b/test/bun.js/inspect.test.js @@ -1,5 +1,22 @@ import { it, expect } from "bun:test"; +it("utf16 property name", () => { + var { Database } = require("bun:sqlite"); + const db = Database.open(":memory:"); + expect(Bun.inspect(db.prepare("select '😀' as 笑").all())).toBe( + '[ { "笑": "😀" } ]' + ); +}); + +it("latin1", () => { + expect(Bun.inspect("English")).toBe("English"); + expect(Bun.inspect("Français")).toBe("Français"); + expect(Bun.inspect("Ελληνική")).toBe("Ελληνική"); + expect(Bun.inspect("日本語")).toBe("日本語"); + expect(Bun.inspect("Emoji😎")).toBe("Emoji😎"); + expect(Bun.inspect("Français / Ελληνική")).toBe("Français / Ελληνική"); +}); + it("Request object", () => { expect(Bun.inspect(new Request({ url: "https://example.com" })).trim()).toBe( ` |