diff options
author | 2022-03-05 02:56:29 -0800 | |
---|---|---|
committer | 2022-03-05 02:56:29 -0800 | |
commit | af6c18ccc510e582987bc04b5187337b2263d6f6 (patch) | |
tree | 4103be915c0f75acd98494655edb29bb06626970 | |
parent | 4f5aa438d6799f24c7a112ca8dc0910466944f2e (diff) | |
download | bun-af6c18ccc510e582987bc04b5187337b2263d6f6.tar.gz bun-af6c18ccc510e582987bc04b5187337b2263d6f6.tar.zst bun-af6c18ccc510e582987bc04b5187337b2263d6f6.zip |
Begin implementing sourcemaps
-rw-r--r-- | src/sourcemap/sourcemap.zig | 554 |
1 files changed, 554 insertions, 0 deletions
diff --git a/src/sourcemap/sourcemap.zig b/src/sourcemap/sourcemap.zig new file mode 100644 index 000000000..ccd5561c8 --- /dev/null +++ b/src/sourcemap/sourcemap.zig @@ -0,0 +1,554 @@ +pub const VLQ_BASE_SHIFT: u32 = 5; +pub const VLQ_BASE: u32 = 1 << VLQ_BASE_SHIFT; +pub const VLQ_BASE_MASK: u32 = VLQ_BASE - 1; +pub const VLQ_CONTINUATION_BIT: u32 = VLQ_BASE; +pub const VLQ_CONTINUATION_MASK: u32 = 1 << VLQ_CONTINUATION_BIT; +const std = @import("std"); +const JSAst = @import("../js_ast.zig"); +const BabyList = JSAst.BabyList; +const Logger = @import("../logger.zig"); +const strings = @import("../string_immutable.zig"); +const MutableString = @import("../string_mutable.zig").MutableString; +const base64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +const base64_lut: [std.math.maxInt(u8)]u8 = brk: { + @setEvalBranchQuota(9999); + var bytes = [_]u8{255} ** std.math.maxInt(u8); + + for (base64) |c, i| { + bytes[c] = i; + } + + break :brk bytes; +}; +const SourceMap = @This(); + +// One VLQ value never exceeds 20 bits of data, so 15 is more than enough +pub const VLQ = struct { + bytes: [15]u8, + len: u8 = 0, +}; + +/// Coordinates in source maps are stored using relative offsets for size +/// reasons. When joining together chunks of a source map that were emitted +/// in parallel for different parts of a file, we need to fix up the first +/// segment of each chunk to be relative to the end of the previous chunk. +pub const SourceMapState = struct { + /// This isn't stored in the source map. It's only used by the bundler to join + /// source map chunks together correctly. + generated_line: i32 = 0, + + /// These are stored in the source map in VLQ format. + generated_column: i32 = 0, + source_index: i32 = 0, + original_line: i32 = 0, + original_column: i32 = 0, +}; + +sources: [][]const u8 = &[_][]u8{}, +sources_content: [][]SourceContent, +mapping: std.ArrayListUnmanaged(Mapping) = .{}, +allocator: std.mem.Allocator, + +pub const Mapping = extern struct { + contents: [5]i32 = undefined, + + pub inline fn generatedLine(mapping: Mapping) i32 { + return mapping.contents[0]; + } + + pub inline fn generatedColumn(mapping: Mapping) i32 { + return mapping.contents[1]; + } + + pub inline fn sourceIndex(mapping: Mapping) i32 { + return mapping.contents[2]; + } + + pub inline fn originalLine(mapping: Mapping) i32 { + return mapping.contents[3]; + } + + pub inline fn originalColumn(mapping: Mapping) i32 { + return mapping.contents[4]; + } +}; + +pub const LineColumnOffset = struct { + lines: i32 = 0, + columns: i32 = 0, +}; + +pub const SourceContent = struct { + value: []const u16 = &[_]u16{}, + quoted: []const u8 = &[_]u8{}, +}; + +pub fn find( + this: *const SourceMap, + line: i32, + column: i32, +) ?*const Mapping { + _ = this; + _ = line; + _ = column; +} + +// A single base 64 digit can contain 6 bits of data. For the base 64 variable +// length quantities we use in the source map spec, the first bit is the sign, +// the next four bits are the actual value, and the 6th bit is the continuation +// bit. The continuation bit tells us whether there are more digits in this +// value following this digit. +// +// Continuation +// | Sign +// | | +// V V +// 101011 +// +pub fn encodeVLQ( + value: i32, +) VLQ { + var vlq: i32 = 0; + var bytes = std.mem.zeroes([15]u8); + var len: u8 = 0; + if (value < 0) { + vlq = ((-value) << 1) | 1; + } else { + vlq = value << 1; + } + + if ((vlq >> 5) == 0) { + const digit = @intCast(u8, vlq & 31); + bytes[0] = base64[digit]; + return .{ .bytes = bytes, .len = 1 }; + } + + // i32 contains 32 bits + // since + // the maximum possible number is @maxInt(u20) + // + while (true) { + var digit = @intCast(u8, vlq & 31); + vlq >>= 5; + + // If there are still more digits in this value, we must make sure the + // continuation bit is marked + if (vlq != 0) { + digit |= 32; + } + + bytes[len] = base64[digit]; + len += 1; + + if (vlq == 0) { + break; + } + } + + return .{ .bytes = bytes, .len = len }; +} + +pub const VLQResult = struct { + value: i32 = 0, + start: usize = 0, +}; + +fn decodeVLQ(encoded: []const u8, start: usize) VLQResult { + var shift: i32 = 0; + var vlq: i32 = 0; + var len: usize = encoded.len; + var i: usize = start; + + while (i < len) { + const index = @as(i32, base64_lut[encoded[i]]); + if (index == std.math.maxInt(u8)) break; + + // decode a byte + vlq |= (index & 31) << shift; + i += 1; + shift += 5; + + // Stop if there's no continuation bit + if ((index & 32) == 0) { + break; + } + } + + var value = vlq >> 1; + if ((vlq & 1) != 0) { + value = -value; + } + + return VLQResult{ .start = i, .value = value }; +} + +pub const LineOffsetTable = struct { + + /// The source map specification is very loose and does not specify what + /// column numbers actually mean. The popular "source-map" library from Mozilla + /// appears to interpret them as counts of UTF-16 code units, so we generate + /// those too for compatibility. + /// + /// We keep mapping tables around to accelerate conversion from byte offsets + /// to UTF-16 code unit counts. However, this mapping takes up a lot of memory + /// and takes up a lot of memory. Since most JavaScript is ASCII and the + /// mapping for ASCII is 1:1, we avoid creating a table for ASCII-only lines + /// as an optimization. + /// + columns_for_non_ascii: BabyList(i32) = .{}, + byte_offset_to_first_non_ascii: u32 = 0, + byte_offset_to_start_of_line: u32 = 0, + + pub const List = std.MultiArrayList(LineOffsetTable); + + pub fn findLine(list: List, loc: Logger.Loc) i32 { + const byte_offsets_to_start_of_line = list.items(.byte_offset_to_start_of_line); + var original_line: i32 = 0; + if (loc.start <= -1) { + return 0; + } + + const loc_start = @intCast(u32, loc.start); + + { + var count = @truncate(u32, byte_offsets_to_start_of_line.len); + var i: u32 = 0; + while (count > 0) { + const step = count / 2; + i = original_line + step; + if (byte_offsets_to_start_of_line[i] <= loc_start) { + original_line = @intCast(i32, i + 1); + count = count - step - 1; + } else { + count = step; + } + } + } + + return original_line - 1; + } + + pub fn generate(allocator: std.mem.Allocator, contents: []const u8, approximate_line_count: i32) List { + var list = List{}; + // Preallocate the top-level table using the approximate line count from the lexer + list.ensureUnusedCapacity(allocator, @intCast(usize, @maximum(approximate_line_count, 1))) catch unreachable; + var column: i32 = 0; + var byte_offset_to_first_non_ascii: u32 = 0; + var column_byte_offset: u32 = 0; + var line_byte_offset: u32 = 0; + + // the idea here is: + // we want to avoid re-allocating this array _most_ of the time + // when lines _do_ have unicode characters, they probably still won't be longer than 255 much + var stack_fallback = std.heap.stackFallback(@sizeOf(i32) * 256, allocator); + var columns_for_non_ascii = std.ArrayList(i32).initCapacity(stack_fallback.get(), 120) catch unreachable; + const reset_end_index = stack_fallback.fixed_buffer_allocator.end_index; + const columns_for_non_ascii_reset = columns_for_non_ascii; + + var remaining = contents; + while (remaining.len > 0) { + // TODO: SIMD + const len_ = strings.wtf8ByteSequenceLength(remaining[0]); + const c = strings.decodeWTF8RuneT(remaining.ptr[0..4], len_, i32, 0); + + if (column == 0) { + line_byte_offset = @truncate( + u32, + @ptrToInt(remaining.ptr) - @ptrToInt(contents.ptr), + ); + } + + if (c > 0x7F and columns_for_non_ascii.items.len == 0) { + // reset the buffers + columns_for_non_ascii = columns_for_non_ascii_reset; + stack_fallback.fixed_buffer_allocator.reset(); + stack_fallback.fixed_buffer_allocator.end_index = reset_end_index; + + // we have a non-ASCII character, so we need to keep track of the + // mapping from byte offsets to UTF-16 code unit counts + columns_for_non_ascii.appendAssumeCapacity(column); + column_byte_offset = @intCast( + u32, + (@ptrToInt( + remaining.ptr, + ) - @ptrToInt( + contents.ptr, + )) - line_byte_offset, + ); + byte_offset_to_first_non_ascii = line_byte_offset; + } + + // Update the per-byte column offsets + if (columns_for_non_ascii.items.len > 0) { + const line_bytes_so_far = @intCast(u32, @truncate( + u32, + @ptrToInt(remaining.ptr) - @ptrToInt(contents.ptr), + )) - line_byte_offset; + try columns_for_non_ascii.ensureUnusedCapacity((line_bytes_so_far - column_byte_offset) + 1); + while (column_byte_offset <= line_bytes_so_far) : (column_byte_offset += 1) { + columns_for_non_ascii.appendAssumeCapacity(column); + } + } + + switch (c) { + '\r', '\n', 0x2028, 0x2029 => { + // windows newline + if (c == '\r' and remaining.len > 1 and remaining[1] == '\n') { + column += 1; + remaining = remaining[1..]; + continue; + } + var columns_list = columns_for_non_ascii; + if (columns_for_non_ascii.items.len > 0 and stack_fallback.fixed_buffer_allocator.ownsSlice(columns_for_non_ascii.items)) { + columns_for_non_ascii.items = try allocator.dupe(i32, columns_for_non_ascii.toOwnedSlice()); + columns_for_non_ascii.capacity = columns_for_non_ascii.items.len; + } + + try list.append(allocator, .{ + .byte_offset_to_start_of_line = line_byte_offset, + .byte_offset_to_first_non_ascii = byte_offset_to_first_non_ascii, + .columns_for_non_ascii = BabyList(i32).fromList(columns_list), + }); + column = 0; + byte_offset_to_first_non_ascii = 0; + column_byte_offset = 0; + line_byte_offset = 0; + columns_for_non_ascii = columns_for_non_ascii_reset; + stack_fallback.fixed_buffer_allocator.reset(); + stack_fallback.fixed_buffer_allocator.end_index = reset_end_index; + }, + else => { + // Mozilla's "source-map" library counts columns using UTF-16 code units + column += @as(i32, @boolToInt(c > 0xFFFF)) + 1; + }, + } + + remaining = remaining[len_..]; + } + + // Mark the start of the next line + if (column == 0) { + line_byte_offset = @intCast(i32, contents.len); + } + + if (columns_for_non_ascii.items.len > 0) { + const line_bytes_so_far = @intCast(u32, contents.len) - line_byte_offset; + try columns_for_non_ascii.ensureUnusedCapacity((line_bytes_so_far - column_byte_offset) + 1); + while (column_byte_offset <= line_bytes_so_far) : (column_byte_offset += 1) { + columns_for_non_ascii.appendAssumeCapacity(column); + } + } + + { + var columns_list = columns_for_non_ascii; + if (columns_for_non_ascii.items.len > 0 and stack_fallback.fixed_buffer_allocator.ownsSlice(columns_for_non_ascii.items)) { + columns_for_non_ascii.items = try allocator.dupe(i32, columns_for_non_ascii.toOwnedSlice()); + columns_for_non_ascii.capacity = columns_for_non_ascii.items.len; + } + + try list.append(allocator, .{ + .byte_offset_to_start_of_line = line_byte_offset, + .byte_offset_to_first_non_ascii = byte_offset_to_first_non_ascii, + .columns_for_non_ascii = BabyList(i32).fromList(columns_list), + }); + } + + return list; + } +}; + +pub fn appendMappingToBuffer(buffer_: MutableString, last_byte: u8, prev_state: SourceMapState, current_state: SourceMapState) MutableString { + var buffer = buffer_; + const needs_comma = last_byte != 0 and last_byte != ';' and last_byte != '"'; + + const vlq = [_]VLQ{ + // Record the generated column (the line is recorded using ';' elsewhere) + encodeVLQ(current_state.generated_column - prev_state.generated_column), + // Record the generated source + encodeVLQ(current_state.source_index - prev_state.source_index), + // Record the original line + encodeVLQ(current_state.original_line - prev_state.original_line), + // Record the original column + encodeVLQ(current_state.original_column - prev_state.original_column), + }; + + // Count exactly how many bytes we need to write + const total_len = @as(u32, vlq[0].len) + + @as(u32, vlq[1].len) + + @as(u32, vlq[2].len) + + @as(u32, vlq[3].len); + buffer.growIfNeeded(total_len + @as(u32, @boolToInt(needs_comma))) catch unreachable; + + // Put commas in between mappings + if (needs_comma) { + buffer.appendCharAssumeCapacity(','); + } + + inline for (vlq) |*item| { + buffer.appendAssumeCapacity(item.bytes[0..item.len]); + } + + return buffer; +} + +pub const Chunk = struct { + buffer: MutableString, + + /// This end state will be used to rewrite the start of the following source + /// map chunk so that the delta-encoded VLQ numbers are preserved. + end_state: SourceMapState = .{}, + + /// There probably isn't a source mapping at the end of the file (nor should + /// there be) but if we're appending another source map chunk after this one, + /// we'll need to know how many characters were in the last line we generated. + final_generated_column: i32 = 0, + + /// ignore empty chunks + should_ignore: bool = true, + + pub const Builder = struct { + input_source_map: ?*SourceMap = null, + source_map: MutableString, + line_offset_tables: LineOffsetTable.List = .{}, + prev_state: SourceMapState = SourceMapState{}, + last_generated_update: u32 = 0, + generated_column: i32 = 0, + prev_loc: Logger.Loc = Logger.Loc.Empty, + has_prev_state: bool = false, + + // This is a workaround for a bug in the popular "source-map" library: + // https://github.com/mozilla/source-map/issues/261. The library will + // sometimes return null when querying a source map unless every line + // starts with a mapping at column zero. + // + // The workaround is to replicate the previous mapping if a line ends + // up not starting with a mapping. This is done lazily because we want + // to avoid replicating the previous mapping if we don't need to. + line_starts_with_mapping: bool = false, + cover_lines_without_mappings: bool = false, + + pub fn generateChunk(b: *Builder, output: []const u8) Chunk { + b.updateGeneratedLineAndColumn(output); + return Chunk{ + .buffer = b.source_map, + .end_state = b.prev_state, + .final_generated_column = b.generated_column, + .should_ignore = strings.containsAnyBesidesChar(b.source_map.list.items, ';'), + }; + } + + // Scan over the printed text since the last source mapping and update the + // generated line and column numbers + pub fn updateGeneratedLineAndColumn(b: *Builder, output: []const u8) void { + const slice = output[b.last_generated_update..]; + + var iter = strings.CodepointIterator.init(slice); + var cursor = strings.CodepointIterator.Cursor{}; + while (iter.next(&cursor)) { + switch (cursor.c) { + '\r', '\n', 0x2028, 0x2029 => { + // windows newline + if (cursor.c == '\r') { + const newline_check = b.last_generated_update + cursor.i + 1; + if (newline_check < output.len and output[newline_check] == '\n') + continue; + } + + // If we're about to move to the next line and the previous line didn't have + // any mappings, add a mapping at the start of the previous line. + if (b.cover_lines_without_mappings and !b.line_starts_with_mapping and b.has_prev_state) { + b.appendMappingWithoutRemapping(.{ + .generated_line = b.prev_state.generated_line, + .generated_column = 0, + .source_index = b.prev_state.source_index, + .original_line = b.prev_state.original_line, + .original_column = b.prev_state.original_column, + }); + } + + b.prev_state.generated_line += 1; + b.prev_state.generated_column = 0; + b.generated_column = 0; + b.source_map.appendChar(';') catch unreachable; + + // This new line doesn't have a mapping yet + b.line_starts_with_mapping = false; + }, + else => { + // Mozilla's "source-map" library counts columns using UTF-16 code units + b.generated_column += @as(i32, @boolToInt(cursor.c > 0xFFFF)) + 1; + }, + } + } + + b.last_generated_update = @truncate(u32, output.len); + } + + pub fn appendMapping(b: *Builder, current_state_: SourceMapState) void { + var current_state = current_state_; + // If the input file had a source map, map all the way back to the original + if (b.input_source_map) |input| { + if (input.find(current_state.original_line, current_state.original_column)) |mapping| { + current_state.source_index = mapping.sourceIndex(); + current_state.original_line = mapping.originalLine(); + current_state.original_column = mapping.originalColumn(); + } + } + + b.appendMappingWithoutRemapping(current_state); + } + + pub fn appendMappingWithoutRemapping(b: *Builder, current_state: SourceMapState) void { + const last_byte: u8 = if (b.source_map.list.items.len > 0) + b.source_map.list.items[b.source_map.list.items.len - 1] + else + 0; + + b.source_map = appendMappingToBuffer(b.source_map, last_byte, b.prev_state, current_state); + b.prev_state = current_state; + b.has_prev_state = true; + } + + pub fn addSourceMapping(b: *Builder, loc: Logger.Loc, output: []const u8) void { + if (b.prev_loc.eql(loc)) { + return; + } + + b.prev_loc = loc; + const list = b.line_offset_tables; + const original_line = LineOffsetTable.findLine(list, loc); + const line = list.get(@intCast(usize, @maximum(original_line, 0))); + + // Use the line to compute the column + var original_column = loc.start - @intCast(i32, line.byte_offset_to_start_of_line); + if (line.columns_for_non_ascii.len > 0 and original_column >= @intCast(i32, line.byte_offset_to_first_non_ascii)) { + original_column = line.columns_for_non_ascii.slice()[@intCast(u32, original_column) - line.byte_offset_to_first_non_ascii]; + } + + b.updateGeneratedLineAndColumn(output); + + // If this line doesn't start with a mapping and we're about to add a mapping + // that's not at the start, insert a mapping first so the line starts with one. + if (b.cover_lines_without_mappings and !b.line_starts_with_mapping and b.generated_column > 0 and b.has_prev_state) { + b.appendMappingWithoutRemapping(.{ + .generated_line = b.prev_state.generated_line, + .generated_column = 0, + .source_index = b.prev_state.source_index, + .original_line = b.prev_state.original_line, + .original_column = b.prev_state.original_column, + }); + } + + b.appendMapping(.{ + .generated_line = b.prev_state.generated_line, + .generated_column = b.generated_column, + .source_index = b.prev_state.source_index, + .original_line = original_line, + .original_column = b.prev_state.original_column, + }); + + // This line now has a mapping on it, so don't insert another one + b.line_starts_with_mapping = true; + } + }; +}; |