107 files changed, 38215 insertions, 27851 deletions
diff --git a/src/bun.js/WebKit b/src/bun.js/WebKit
-Subproject 4c8ab8fdfb102522fdd8e55d4eea53e8ce2755c
+Subproject 26c819733315f0ab64ae8e8e65b77d77d31211e
diff --git a/src/bun.js/api/JSBundler.zig b/src/bun.js/api/JSBundler.zig
index 8e85f1190..44ceaee9d 100644
--- a/src/bun.js/api/JSBundler.zig
+++ b/src/bun.js/api/JSBundler.zig
@@ -26,7 +26,7 @@ const strings = bun.strings;
 const NewClass = Base.NewClass;
 const To = Base.To;
 const Request = WebCore.Request;
-
+const String = bun.String;
 const FetchEvent = WebCore.FetchEvent;
 const MacroMap = @import("../../resolver/package_json.zig").MacroMap;
 const TSConfigJSON = @import("../../resolver/tsconfig_json.zig").TSConfigJSON;
@@ -844,7 +844,7 @@ pub const JSBundler = struct {
 
                 this.value = .{
                     .success = .{
-                        .loader = @intToEnum(options.Loader, @intCast(u8, loader_as_int.to(i32))),
+                        .loader = @enumFromInt(options.Loader, @intCast(u8, loader_as_int.to(i32))),
                         .source_code = source_code,
                     },
                 };
@@ -871,16 +871,16 @@ pub const JSBundler = struct {
 
         extern fn JSBundlerPlugin__anyMatches(
             *Plugin,
-            namespaceString: *const ZigString,
-            path: *const ZigString,
+            namespaceString: *const String,
+            path: *const String,
             bool,
         ) bool;
 
         extern fn JSBundlerPlugin__matchOnLoad(
             *JSC.JSGlobalObject,
             *Plugin,
-            namespaceString: *const ZigString,
-            path: *const ZigString,
+            namespaceString: *const String,
+            path: *const String,
             context: *anyopaque,
             u8,
         ) void;
@@ -888,9 +888,9 @@ pub const JSBundler = struct {
         extern fn JSBundlerPlugin__matchOnResolve(
             *JSC.JSGlobalObject,
             *Plugin,
-            namespaceString: *const ZigString,
-            path: *const ZigString,
-            importer: *const ZigString,
+            namespaceString: *const String,
+            path: *const String,
+            importer: *const String,
             context: *anyopaque,
             u8,
         ) void;
@@ -905,10 +905,14 @@ pub const JSBundler = struct {
             defer tracer.end();
 
             const namespace_string = if (path.isFile())
-                ZigString.Empty
+                bun.String.empty
             else
-                ZigString.fromUTF8(path.namespace);
-            const path_string = ZigString.fromUTF8(path.text);
+                bun.String.create(path.namespace);
+            const path_string = bun.String.create(path.text);
+            // Mirror the sibling matchOnLoad/matchOnResolve helpers: drop the references
+            // held by these bun.String values once the native call has returned.
+            defer namespace_string.deref();
+            defer path_string.deref();
             return JSBundlerPlugin__anyMatches(this, &namespace_string, &path_string, is_onLoad);
         }
 
@@ -924,11 +924,13 @@ pub const JSBundler = struct {
             const tracer = bun.tracy.traceNamed(@src(), "JSBundler.matchOnLoad");
             defer tracer.end();
             const namespace_string = if (namespace.len == 0)
-                ZigString.init("file")
+                bun.String.static("file")
             else
-                ZigString.fromUTF8(namespace);
-            const path_string = ZigString.fromUTF8(path);
-            JSBundlerPlugin__matchOnLoad(globalThis, this, &namespace_string, &path_string, context, @enumToInt(default_loader));
+                bun.String.create(namespace);
+            const path_string = bun.String.create(path);
+            defer namespace_string.deref();
+            defer path_string.deref();
+            JSBundlerPlugin__matchOnLoad(globalThis, this, &namespace_string, &path_string, context, @intFromEnum(default_loader));
         }
 
         pub fn matchOnResolve(
@@ -944,12 +946,15 @@ pub const JSBundler = struct {
             const tracer = bun.tracy.traceNamed(@src(), "JSBundler.matchOnResolve");
             defer tracer.end();
             const namespace_string = if (strings.eqlComptime(namespace, "file"))
-                ZigString.Empty
+                bun.String.empty
             else
-                ZigString.fromUTF8(namespace);
-            const path_string = ZigString.fromUTF8(path);
-            const importer_string = ZigString.fromUTF8(importer);
-            JSBundlerPlugin__matchOnResolve(globalThis, this, &namespace_string, &path_string, &importer_string, context, @enumToInt(import_record_kind));
+                bun.String.create(namespace);
+            const path_string = bun.String.create(path);
+            const importer_string = bun.String.create(importer);
+            defer namespace_string.deref();
+            defer path_string.deref();
+            defer importer_string.deref();
+            JSBundlerPlugin__matchOnResolve(globalThis, this, &namespace_string, &path_string, &importer_string, context, @intFromEnum(import_record_kind));
         }
 
         pub fn addPlugin(
diff --git a/src/bun.js/api/JSTranspiler.zig b/src/bun.js/api/JSTranspiler.zig
index a1e1cfa36..308738abf 100644
--- a/src/bun.js/api/JSTranspiler.zig
+++ b/src/bun.js/api/JSTranspiler.zig
@@ -85,7 +85,7 @@ const TranspilerOptions = struct {
 // This is going to be hard to not leak
 pub const TransformTask = struct {
     input_code: ZigString = ZigString.init(""),
-    protected_input_value: JSC.JSValue = @intToEnum(JSC.JSValue, 0),
+    protected_input_value: JSC.JSValue = @enumFromInt(JSC.JSValue, 0),
     output_code: ZigString = ZigString.init(""),
     bundler: Bundler.Bundler = undefined,
     log: logger.Log,
@@ -220,8 +220,8 @@ pub const TransformTask = struct {
 
         finish(this.output_code, this.global, promise);
 
-        if (@enumToInt(this.protected_input_value) != 0) {
-            this.protected_input_value = @intToEnum(JSC.JSValue, 0);
+        if (@intFromEnum(this.protected_input_value) != 0) {
+            this.protected_input_value = @enumFromInt(JSC.JSValue, 0);
         }
         this.deinit();
     }
@@ -611,7 +611,7 @@ fn transformOptionsFromJSC(globalObject: JSC.C.JSContextRef, temp_allocator: std
                 while (length_iter.next()) |value| {
                     if (value.isString()) {
                         const length = @truncate(u32, value.getLength(globalThis));
-                        string_count += @as(u32, @boolToInt(length > 0));
+                        string_count += @as(u32, @intFromBool(length > 0));
                         total_name_buf_len += length;
                     }
                 }
@@ -877,7 +877,7 @@ fn getParseResult(this: *Transpiler, allocator: std.mem.Allocator, code: []const
         for (res.ast.import_records.slice()) |*import| {
             if (import.kind.isCommonJS()) {
                 import.do_commonjs_transform_in_printer = true;
-                import.module_id = @truncate(u32, std.hash.Wyhash.hash(0, import.path.pretty));
+                import.module_id = @truncate(u32, bun.hash(import.path.pretty));
             }
         }
     }
diff --git a/src/bun.js/api/bun.zig b/src/bun.js/api/bun.zig
index 5580e8840..fbf567446 100644
--- a/src/bun.js/api/bun.zig
+++ b/src/bun.js/api/bun.zig
@@ -303,7 +303,7 @@ pub fn registerMacro(
         return js.JSValueMakeUndefined(ctx);
     }
     // TODO: make this faster
-    const id = @truncate(i32, @floatToInt(i64, js.JSValueToNumber(ctx, arguments[0], exception)));
+    const id = @truncate(i32, @intFromFloat(i64, js.JSValueToNumber(ctx, arguments[0], exception)));
     if (id == -1 or id == 0) {
         JSError(getAllocator(ctx), "Internal error registering macros: invalid id", .{}, ctx, exception);
         return js.JSValueMakeUndefined(ctx);
@@ -523,7 +523,7 @@ pub fn getFilePath(ctx: js.JSContextRef, arguments: []const js.JSValueRef, buf:
 
             temp_strings_list[temp_strings_list_len] = out_slice;
             // The dots are kind of unnecessary. They'll be normalized.
-            if (out.len == 0 or @ptrToInt(out.ptr) == 0 or std.mem.eql(u8, out_slice, ".") or std.mem.eql(u8, out_slice, "..") or std.mem.eql(u8, out_slice, "../")) {
+            if (out.len == 0 or @intFromPtr(out.ptr) == 0 or std.mem.eql(u8, out_slice, ".") or std.mem.eql(u8, out_slice, "..") or std.mem.eql(u8, out_slice, "../")) {
                 JSError(getAllocator(ctx), "Expected a file path as a string or an array of strings to be part of a file path.", .{}, ctx, exception);
                 return null;
             }
@@ -600,7 +600,7 @@ pub fn readFileAsStringCallback(
         return js.JSValueMakeUndefined(ctx);
     };
 
-    if (stat.kind != .File) {
+    if (stat.kind != .file) {
         JSError(getAllocator(ctx), "Can't read a {s} as a string (\"{s}\")", .{ @tagName(stat.kind), path }, ctx, exception);
         return js.JSValueMakeUndefined(ctx);
     }
@@ -641,7 +641,7 @@ pub fn readFileAsBytesCallback(
         return js.JSValueMakeUndefined(ctx);
     };
 
-    if (stat.kind != .File) {
+    if (stat.kind != .file) {
         JSError(allocator, "Can't read a {s} as a string (\"{s}\")", .{ @tagName(stat.kind), path }, ctx, exception);
         return js.JSValueMakeUndefined(ctx);
     }
@@ -896,6 +896,9 @@ pub fn createNodeFS(
 ) js.JSValueRef {
     var module = ctx.allocator().create(JSC.Node.NodeJSFS) catch unreachable;
     module.* = .{};
+    var vm = ctx.bunVM();
+    if (vm.standalone_module_graph != null)
+        module.node_fs.vm = vm;
 
     return module.toJS(ctx).asObjectRef();
 }
@@ -1612,7 +1615,7 @@ pub const Crypto = struct {
 
     fn createCryptoError(globalThis: *JSC.JSGlobalObject, err_code: u32) JSValue {
         var outbuf: [128 + 1 + "BoringSSL error: ".len]u8 = undefined;
-        @memset(&outbuf, 0, outbuf.len);
+        @memset(&outbuf, 0);
         outbuf[0.."BoringSSL error: ".len].* = "BoringSSL error: ".*;
         var message_buf = outbuf["BoringSSL error: ".len..];
 
@@ -3171,9 +3174,9 @@ pub fn mmapFile(
 
     return JSC.C.JSObjectMakeTypedArrayWithBytesNoCopy(ctx, JSC.C.JSTypedArrayType.kJSTypedArrayTypeUint8Array, @ptrCast(?*anyopaque, map.ptr), map.len, struct {
         pub fn x(ptr: ?*anyopaque, size: ?*anyopaque) callconv(.C) void {
-            _ = JSC.Node.Syscall.munmap(@ptrCast([*]align(std.mem.page_size) u8, @alignCast(std.mem.page_size, ptr))[0..@ptrToInt(size)]);
+            _ = JSC.Node.Syscall.munmap(@ptrCast([*]align(std.mem.page_size) u8, @alignCast(std.mem.page_size, ptr))[0..@intFromPtr(size)]);
         }
-    }.x, @intToPtr(?*anyopaque, map.len), exception);
+    }.x, @ptrFromInt(?*anyopaque, map.len), exception);
 }
 
 pub fn getTranspilerConstructor(
@@ -3401,7 +3404,7 @@ pub const Unsafe = struct {
         globalThis: *JSC.JSGlobalObject,
         value_: ?JSValue,
     ) JSValue {
-        const ret = JSValue.jsNumber(@as(i32, @enumToInt(globalThis.bunVM().aggressive_garbage_collection)));
+        const ret = JSValue.jsNumber(@as(i32, @intFromEnum(globalThis.bunVM().aggressive_garbage_collection)));
 
         if (value_) |value| {
             switch (value.coerce(i32, globalThis)) {
@@ -3712,21 +3715,32 @@ pub const Timer = struct {
             const kind = this.kind;
             var map: *TimeoutMap = vm.timer.maps.get(kind);
 
-            // This doesn't deinit the timer
-            // Timers are deinit'd separately
-            // We do need to handle when the timer is cancelled after the job has been enqueued
-            if (kind != .setInterval) {
-                if (map.fetchSwapRemove(this.id) == null) {
-                    // if the timeout was cancelled, don't run the callback
-                    this.deinit();
-                    return;
-                }
-            } else {
-                if (!map.contains(this.id)) {
-                    // if the interval was cancelled, don't run the callback
-                    this.deinit();
-                    return;
+            const should_cancel_job = brk: {
+                // This doesn't deinit the timer
+                // Timers are deinit'd separately
+                // We do need to handle when the timer is cancelled after the job has been enqueued
+                if (kind != .setInterval) {
+                    if (map.get(this.id)) |tombstone_or_timer| {
+                        break :brk tombstone_or_timer != null;
+                    } else {
+                        // clearTimeout has been called
+                        break :brk true;
+                    }
+                } else {
+                    if (map.get(this.id)) |tombstone_or_timer| {
+                        // .refresh() was called after CallbackJob enqueued
+                        break :brk tombstone_or_timer == null;
+                    }
                 }
+
+                break :brk false;
+            };
+
+            if (should_cancel_job) {
+                this.deinit();
+                return;
+            } else if (kind != .setInterval) {
+                _ = map.swapRemove(this.id);
             }
 
             var args_buf: [8]JSC.JSValue = undefined;
@@ -3791,6 +3805,8 @@ pub const Timer = struct {
                         result.then(globalThis, this, CallbackJob__onResolve, CallbackJob__onReject);
                     },
                 }
+            } else {
+                this.deinit();
             }
         }
     };
@@ -3820,10 +3836,29 @@ pub const Timer = struct {
             return timer_js;
         }
 
-        pub fn doRef(this: *TimerObject, _: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSValue {
+        pub fn doRef(this: *TimerObject, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSValue {
+            const this_value = callframe.this();
+            this_value.ensureStillAlive();
             if (this.ref_count > 0)
                 this.ref_count +|= 1;
-            return JSValue.jsUndefined();
+
+            var vm = globalObject.bunVM();
+            switch (this.kind) {
+                .setTimeout, .setImmediate, .setInterval => {
+                    if (vm.timer.maps.get(this.kind).getPtr(this.id)) |val_| {
+                        if (val_.*) |*val| {
+                            val.poll_ref.ref(vm);
+
+                            if (val.did_unref_timer) {
+                                val.did_unref_timer = false;
+                                vm.uws_event_loop.?.num_polls += 1;
+                            }
+                        }
+                    }
+                },
+            }
+
+            return this_value;
         }
 
         pub fn doRefresh(this: *TimerObject, globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSValue {
@@ -3912,27 +3947,34 @@ pub const Timer = struct {
                     id,
                     Timeout.run,
                     this.interval,
-                    @as(i32, @boolToInt(this.kind == .setInterval)) * this.interval,
+                    @as(i32, @intFromBool(this.kind == .setInterval)) * this.interval,
                 );
                 return this_value;
             }
             return JSValue.jsUndefined();
         }
 
-        pub fn doUnref(this: *TimerObject, globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSValue {
+        pub fn doUnref(this: *TimerObject, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSValue {
+            const this_value = callframe.this();
+            this_value.ensureStillAlive();
             this.ref_count -|= 1;
-            if (this.ref_count == 0) {
-                switch (this.kind) {
-                    .setTimeout, .setImmediate => {
-                        _ = clearTimeout(globalObject, JSValue.jsNumber(this.id));
-                    },
-                    .setInterval => {
-                        _ = clearInterval(globalObject, JSValue.jsNumber(this.id));
-                    },
-                }
+            var vm = globalObject.bunVM();
+            switch (this.kind) {
+                .setTimeout, .setImmediate, .setInterval => {
+                    if (vm.timer.maps.get(this.kind).getPtr(this.id)) |val_| {
+                        if (val_.*) |*val| {
+                            val.poll_ref.unref(vm);
+
+                            if (!val.did_unref_timer) {
+                                val.did_unref_timer = true;
+                                vm.uws_event_loop.?.num_polls -= 1;
+                            }
+                        }
+                    }
+                },
             }
 
-            return JSValue.jsUndefined();
+            return this_value;
         }
         pub fn hasRef(this: *TimerObject, globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSValue {
             return JSValue.jsBoolean(this.ref_count > 0 and globalObject.bunVM().timer.maps.get(this.kind).contains(this.id));
@@ -3954,6 +3996,7 @@ pub const Timer = struct {
         callback: JSC.Strong = .{},
         globalThis: *JSC.JSGlobalObject,
         timer: *uws.Timer,
+        did_unref_timer: bool = false,
         poll_ref: JSC.PollRef = JSC.PollRef.init(),
         arguments: JSC.Strong = .{},
 
@@ -4055,8 +4098,14 @@ pub const Timer = struct {
 
             var vm = this.globalThis.bunVM();
 
-            this.poll_ref.unrefOnNextTick(vm);
+            this.poll_ref.unref(vm);
+
             this.timer.deinit();
+            if (this.did_unref_timer) {
+                // balance double-unrefing
+                vm.uws_event_loop.?.num_polls += 1;
+            }
+
             this.callback.deinit();
             this.arguments.deinit();
         }
@@ -4130,7 +4179,7 @@ pub const Timer = struct {
             },
             Timeout.run,
             interval,
-            @as(i32, @boolToInt(kind == .setInterval)) * interval,
+            @as(i32, @intFromBool(kind == .setInterval)) * interval,
         );
     }
 
@@ -4318,7 +4367,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) u8, addr).*;
+            const value = @ptrFromInt(*align(1) u8, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn @"u16"(
@@ -4327,7 +4376,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) u16, addr).*;
+            const value = @ptrFromInt(*align(1) u16, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn @"u32"(
@@ -4336,7 +4385,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) u32, addr).*;
+            const value = @ptrFromInt(*align(1) u32, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn ptr(
@@ -4345,7 +4394,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) u64, addr).*;
+            const value = @ptrFromInt(*align(1) u64, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn @"i8"(
@@ -4354,7 +4403,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) i8, addr).*;
+            const value = @ptrFromInt(*align(1) i8, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn @"i16"(
@@ -4363,7 +4412,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) i16, addr).*;
+            const value = @ptrFromInt(*align(1) i16, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn @"i32"(
@@ -4372,7 +4421,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) i32, addr).*;
+            const value = @ptrFromInt(*align(1) i32, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn intptr(
@@ -4381,7 +4430,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) i64, addr).*;
+            const value = @ptrFromInt(*align(1) i64, addr).*;
             return JSValue.jsNumber(value);
         }
 
@@ -4391,7 +4440,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) f32, addr).*;
+            const value = @ptrFromInt(*align(1) f32, addr).*;
             return JSValue.jsNumber(value);
         }
 
@@ -4401,7 +4450,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) f64, addr).*;
+            const value = @ptrFromInt(*align(1) f64, addr).*;
             return JSValue.jsNumber(value);
         }
 
@@ -4411,7 +4460,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) i64, addr).*;
+            const value = @ptrFromInt(*align(1) i64, addr).*;
             return JSValue.fromInt64NoTruncate(global, value);
         }
 
@@ -4421,7 +4470,7 @@ pub const FFI = struct {
             arguments: []const JSValue,
         ) JSValue {
             const addr = arguments[0].asPtrAddress() + if (arguments.len > 1) @intCast(usize, arguments[1].to(i32)) else @as(usize, 0);
-            const value = @intToPtr(*align(1) u64, addr).*;
+            const value = @ptrFromInt(*align(1) u64, addr).*;
             return JSValue.fromUInt64NoTruncate(global, value);
         }
 
@@ -4432,7 +4481,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) u8, addr).*;
+            const value = @ptrFromInt(*align(1) u8, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn u16WithoutTypeChecks(
@@ -4442,7 +4491,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) u16, addr).*;
+            const value = @ptrFromInt(*align(1) u16, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn u32WithoutTypeChecks(
@@ -4452,7 +4501,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) u32, addr).*;
+            const value = @ptrFromInt(*align(1) u32, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn ptrWithoutTypeChecks(
@@ -4462,7 +4511,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) u64, addr).*;
+            const value = @ptrFromInt(*align(1) u64, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn i8WithoutTypeChecks(
@@ -4472,7 +4521,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) i8, addr).*;
+            const value = @ptrFromInt(*align(1) i8, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn i16WithoutTypeChecks(
@@ -4482,7 +4531,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) i16, addr).*;
+            const value = @ptrFromInt(*align(1) i16, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn i32WithoutTypeChecks(
@@ -4492,7 +4541,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) i32, addr).*;
+            const value = @ptrFromInt(*align(1) i32, addr).*;
             return JSValue.jsNumber(value);
         }
         pub fn intptrWithoutTypeChecks(
@@ -4502,7 +4551,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) i64, addr).*;
+            const value = @ptrFromInt(*align(1) i64, addr).*;
             return JSValue.jsNumber(value);
         }
 
@@ -4513,7 +4562,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) f32, addr).*;
+            const value = @ptrFromInt(*align(1) f32, addr).*;
             return JSValue.jsNumber(value);
         }
 
@@ -4524,7 +4573,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) f64, addr).*;
+            const value = @ptrFromInt(*align(1) f64, addr).*;
             return JSValue.jsNumber(value);
         }
 
@@ -4535,7 +4584,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) u64, addr).*;
+            const value = @ptrFromInt(*align(1) u64, addr).*;
             return JSValue.fromUInt64NoTruncate(global, value);
         }
 
@@ -4546,7 +4595,7 @@ pub const FFI = struct {
             offset: i32,
         ) callconv(.C) JSValue {
             const addr = @intCast(usize, raw_addr) + @intCast(usize, offset);
-            const value = @intToPtr(*align(1) i64, addr).*;
+            const value = @ptrFromInt(*align(1) i64, addr).*;
             return JSValue.fromInt64NoTruncate(global, value);
         }
 
@@ -4590,7 +4639,7 @@ pub const FFI = struct {
         _: *anyopaque,
         array: *JSC.JSUint8Array,
     ) callconv(.C) JSValue {
-        return JSValue.fromPtrAddress(@ptrToInt(array.ptr()));
+        return JSValue.fromPtrAddress(@intFromPtr(array.ptr()));
     }
 
     fn ptr_(
@@ -4610,9 +4659,9 @@ pub const FFI = struct {
             return JSC.toInvalidArguments("ArrayBufferView must have a length > 0. A pointer to empty memory doesn't work", .{}, globalThis);
         }
 
-        var addr: usize = @ptrToInt(array_buffer.ptr);
+        var addr: usize = @intFromPtr(array_buffer.ptr);
         // const Sizes = @import("../bindings/sizes.zig");
-        // std.debug.assert(addr == @ptrToInt(value.asEncoded().ptr) + Sizes.Bun_FFI_PointerOffsetToTypedArrayVector);
+        // std.debug.assert(addr == @intFromPtr(value.asEncoded().ptr) + Sizes.Bun_FFI_PointerOffsetToTypedArrayVector);
 
         if (byteOffset) |off| {
             if (!off.isEmptyOrUndefinedOrNull()) {
@@ -4628,7 +4677,7 @@ pub const FFI = struct {
                 addr += @intCast(usize, bytei64);
             }
 
-            if (addr > @ptrToInt(array_buffer.ptr) + @as(usize, array_buffer.byte_len)) {
+            if (addr > @intFromPtr(array_buffer.ptr) + @as(usize, array_buffer.byte_len)) {
                 return JSC.toInvalidArguments("byteOffset out of bounds", .{}, globalThis);
             }
         }
@@ -4720,11 +4769,11 @@ pub const FFI = struct {
                 }
 
                 const length = @intCast(usize, length_i);
-                return .{ .slice = @intToPtr([*]u8, addr)[0..length] };
+                return .{ .slice = @ptrFromInt([*]u8, addr)[0..length] };
             }
         }
 
-        return .{ .slice = bun.span(@intToPtr([*:0]u8, addr)) };
+        return .{ .slice = bun.span(@ptrFromInt([*:0]u8, addr)) };
     }
 
     fn getCPtr(value: JSValue) ?usize {
@@ -4759,11 +4808,11 @@ pub const FFI = struct {
                 var ctx: ?*anyopaque = null;
                 if (finalizationCallback) |callback_value| {
                     if (getCPtr(callback_value)) |callback_ptr| {
-                        callback = @intToPtr(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
+                        callback = @ptrFromInt(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
 
                         if (finalizationCtxOrPtr) |ctx_value| {
                             if (getCPtr(ctx_value)) |ctx_ptr| {
-                                ctx = @intToPtr(*anyopaque, ctx_ptr);
+                                ctx = @ptrFromInt(*anyopaque, ctx_ptr);
                             } else if (!ctx_value.isUndefinedOrNull()) {
                                 return JSC.toInvalidArguments("Expected user data to be a C pointer (number or BigInt)", .{}, globalThis);
                             }
@@ -4773,7 +4822,7 @@ pub const FFI = struct {
                     }
                 } else if (finalizationCtxOrPtr) |callback_value| {
                     if (getCPtr(callback_value)) |callback_ptr| {
-                        callback = @intToPtr(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
+                        callback = @ptrFromInt(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
                     } else if (!callback_value.isEmptyOrUndefinedOrNull()) {
                         return JSC.toInvalidArguments("Expected callback to be a C pointer (number or BigInt)", .{}, globalThis);
                     }
@@ -4801,11 +4850,11 @@ pub const FFI = struct {
                 var ctx: ?*anyopaque = null;
                 if (finalizationCallback) |callback_value| {
                     if (getCPtr(callback_value)) |callback_ptr| {
-                        callback = @intToPtr(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
+                        callback = @ptrFromInt(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
 
                         if (finalizationCtxOrPtr) |ctx_value| {
                             if (getCPtr(ctx_value)) |ctx_ptr| {
-                                ctx = @intToPtr(*anyopaque, ctx_ptr);
+                                ctx = @ptrFromInt(*anyopaque, ctx_ptr);
                             } else if (!ctx_value.isEmptyOrUndefinedOrNull()) {
                                 return JSC.toInvalidArguments("Expected user data to be a C pointer (number or BigInt)", .{}, globalThis);
                             }
@@ -4815,7 +4864,7 @@ pub const FFI = struct {
                     }
                 } else if (finalizationCtxOrPtr) |callback_value| {
                     if (getCPtr(callback_value)) |callback_ptr| {
-                        callback = @intToPtr(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
+                        callback = @ptrFromInt(JSC.C.JSTypedArrayBytesDeallocator, callback_ptr);
                     } else if (!callback_value.isEmptyOrUndefinedOrNull()) {
                         return JSC.toInvalidArguments("Expected callback to be a C pointer (number or BigInt)", .{}, globalThis);
                     }
@@ -4935,11 +4984,11 @@ pub const EnvironmentVariables = struct {
     pub fn getEnvNames(globalObject: *JSC.JSGlobalObject, names: []ZigString) usize {
         var vm = globalObject.bunVM();
         const keys = vm.bundler.env.map.map.keys();
-        const max = @min(names.len, keys.len);
-        for (keys[0..max], 0..) |key, i| {
-            names[i] = ZigString.initUTF8(key);
+        const len = @min(names.len, keys.len);
+        for (keys[0..len], names[0..len]) |key, *name| {
+            name.* = ZigString.initUTF8(key);
         }
-        return keys.len;
+        return len;
     }
     pub fn getEnvValue(globalObject: *JSC.JSGlobalObject, name: ZigString) ?ZigString {
         var vm = globalObject.bunVM();
diff --git a/src/bun.js/api/bun/dns_resolver.zig b/src/bun.js/api/bun/dns_resolver.zig
index aec295056..d0d4f5b7b 100644
--- a/src/bun.js/api/bun/dns_resolver.zig
+++ b/src/bun.js/api/bun/dns_resolver.zig
@@ -123,7 +123,7 @@ const LibInfo = struct {
                 this.vm.uws_event_loop.?,
                 .machport,
                 true,
-                @ptrToInt(request.backend.libinfo.machport),
+                @intFromPtr(request.backend.libinfo.machport),
             ) == .result,
         );
 
@@ -230,7 +230,7 @@ fn addrInfoCount(addrinfo: *std.c.addrinfo) u32 {
     var count: u32 = 1;
     var current: ?*std.c.addrinfo = addrinfo.next;
     while (current != null) : (current = current.?.next) {
-        count += @boolToInt(current.?.addr != null);
+        count += @intFromBool(current.?.addr != null);
     }
     return count;
 }
@@ -285,7 +285,7 @@ pub const GetAddrInfo = struct {
 
     pub fn toCAres(this: GetAddrInfo) bun.c_ares.AddrInfo_hints {
         var hints: bun.c_ares.AddrInfo_hints = undefined;
-        @memset(std.mem.asBytes(&hints), 0, @sizeOf(bun.c_ares.AddrInfo_hints));
+        @memset(std.mem.asBytes(&hints)[0..@sizeOf(bun.c_ares.AddrInfo_hints)], 0);
 
         hints.ai_family = this.options.family.toLibC();
         hints.ai_socktype = this.options.socktype.toLibC();
@@ -320,7 +320,7 @@ pub const GetAddrInfo = struct {
             }
 
             var hints: std.c.addrinfo = undefined;
-            @memset(std.mem.asBytes(&hints), 0, @sizeOf(std.c.addrinfo));
+            @memset(std.mem.asBytes(&hints)[0..@sizeOf(std.c.addrinfo)], 0);
 
             hints.family = this.family.toLibC();
             hints.socktype = this.socktype.toLibC();
@@ -793,7 +793,7 @@ pub const GetAddrInfoRequest = struct {
         addr_info: ?*std.c.addrinfo,
         arg: ?*anyopaque,
     ) callconv(.C) void {
-        const this = @intToPtr(*GetAddrInfoRequest, @ptrToInt(arg));
+        const this = @ptrFromInt(*GetAddrInfoRequest, @intFromPtr(arg));
         log("getAddrInfoAsyncCallback: status={d}", .{status});
 
         if (this.backend == .libinfo) {
@@ -846,8 +846,8 @@ pub const GetAddrInfoRequest = struct {
                     err,
                     debug_timer,
                 });
-                if (@enumToInt(err) != 0 or addrinfo == null) {
-                    this.* = .{ .err = @enumToInt(err) };
+                if (@intFromEnum(err) != 0 or addrinfo == null) {
+                    this.* = .{ .err = @intFromEnum(err) };
                     return;
                 }
 
@@ -1925,8 +1925,8 @@ pub const DNSResolver = struct {
             .err => |err| {
                 const system_error = JSC.SystemError{
                     .errno = -1,
-                    .code = JSC.ZigString.init(err.code()),
-                    .message = JSC.ZigString.init(err.label()),
+                    .code = bun.String.static(err.code()),
+                    .message = bun.String.static(err.label()),
                 };
 
                 globalThis.throwValue(system_error.toErrorInstance(globalThis));
@@ -1972,8 +1972,8 @@ pub const DNSResolver = struct {
             .err => |err| {
                 const system_error = JSC.SystemError{
                     .errno = -1,
-                    .code = JSC.ZigString.init(err.code()),
-                    .message = JSC.ZigString.init(err.label()),
+                    .code = bun.String.static(err.code()),
+                    .message = bun.String.static(err.label()),
                 };
 
                 globalThis.throwValue(system_error.toErrorInstance(globalThis));
diff --git a/src/bun.js/api/bun/socket.zig b/src/bun.js/api/bun/socket.zig
index 48bfe4218..1d85c705c 100644
--- a/src/bun.js/api/bun/socket.zig
+++ b/src/bun.js/api/bun/socket.zig
@@ -69,6 +69,11 @@ fn normalizeHost(input: anytype) @TypeOf(input) {
 
 const BinaryType = JSC.BinaryType;
 
+const WrappedType = enum {
+    none,
+    tls,
+    tcp,
+};
 const Handlers = struct {
     onOpen: JSC.JSValue = .zero,
     onClose: JSC.JSValue = .zero,
@@ -97,8 +102,8 @@ const Handlers = struct {
         handlers: *Handlers,
         socket_context: *uws.SocketContext,
 
-        pub fn exit(this: *Scope, ssl: bool) void {
-            this.handlers.markInactive(ssl, this.socket_context);
+        pub fn exit(this: *Scope, ssl: bool, wrapped: WrappedType) void {
+            this.handlers.markInactive(ssl, this.socket_context, wrapped);
         }
     };
 
@@ -123,19 +128,24 @@ const Handlers = struct {
         return true;
     }
 
-    pub fn markInactive(this: *Handlers, ssl: bool, ctx: *uws.SocketContext) void {
+    pub fn markInactive(this: *Handlers, ssl: bool, ctx: *uws.SocketContext, wrapped: WrappedType) void {
         Listener.log("markInactive", .{});
         this.active_connections -= 1;
-        if (this.active_connections == 0 and this.is_server) {
-            var listen_socket: *Listener = @fieldParentPtr(Listener, "handlers", this);
-            // allow it to be GC'd once the last connection is closed and it's not listening anymore
-            if (listen_socket.listener == null) {
-                listen_socket.strong_self.clear();
+        if (this.active_connections == 0) {
+            if (this.is_server) {
+                var listen_socket: *Listener = @fieldParentPtr(Listener, "handlers", this);
+                // allow it to be GC'd once the last connection is closed and it's not listening anymore
+                if (listen_socket.listener == null) {
+                    listen_socket.strong_self.clear();
+                }
+            } else {
+                this.unprotect();
+                // deinit the context only when the socket is not wrapped or is the wrapped TCP connection
+                if (wrapped != .tls) {
+                    ctx.deinit(ssl);
+                }
+                bun.default_allocator.destroy(this);
             }
-        } else if (this.active_connections == 0 and !this.is_server) {
-            this.unprotect();
-            ctx.deinit(ssl);
-            bun.default_allocator.destroy(this);
         }
     }
 
@@ -364,6 +374,7 @@ pub const Listener = struct {
     connection: UnixOrHost,
     socket_context: ?*uws.SocketContext = null,
     ssl: bool = false,
+    protos: ?[]const u8 = null,
 
     strong_data: JSC.Strong = .{},
     strong_self: JSC.Strong = .{},
@@ -395,13 +406,26 @@ pub const Listener = struct {
             port: u16,
         },
 
+        pub fn clone(this: UnixOrHost) UnixOrHost {
+            switch (this) {
+                .unix => |u| {
+                    return .{
+                        .unix = (bun.default_allocator.dupe(u8, u) catch unreachable),
+                    };
+                },
+                .host => |h| {
+                    return .{ .host = .{ .host = (bun.default_allocator.dupe(u8, h.host) catch unreachable), .port = this.host.port } };
+                },
+            }
+        }
+
         pub fn deinit(this: UnixOrHost) void {
             switch (this) {
                 .unix => |u| {
-                    bun.default_allocator.destroy(@intToPtr([*]u8, @ptrToInt(u.ptr)));
+                    bun.default_allocator.destroy(@ptrFromInt([*]u8, @intFromPtr(u.ptr)));
                 },
                 .host => |h| {
-                    bun.default_allocator.destroy(@intToPtr([*]u8, @ptrToInt(h.host.ptr)));
+                    bun.default_allocator.destroy(@ptrFromInt([*]u8, @intFromPtr(h.host.ptr)));
                 },
             }
         }
@@ -455,10 +479,12 @@ pub const Listener = struct {
         var socket_config = SocketConfig.fromJS(opts, globalObject, exception) orelse {
             return .zero;
         };
+
         var hostname_or_unix = socket_config.hostname_or_unix;
         var port = socket_config.port;
         var ssl = socket_config.ssl;
         var handlers = socket_config.handlers;
+        var protos: ?[]const u8 = null;
         const exclusive = socket_config.exclusive;
         handlers.is_server = true;
 
@@ -472,7 +498,7 @@ pub const Listener = struct {
         globalObject.bunVM().eventLoop().ensureWaker();
 
         var socket_context = uws.us_create_bun_socket_context(
-            @boolToInt(ssl_enabled),
+            @intFromBool(ssl_enabled),
             uws.Loop.get().?,
             @sizeOf(usize),
             ctx_opts,
@@ -483,7 +509,7 @@ pub const Listener = struct {
                 hostname_or_unix.deinit();
             }
 
-            const errno = @enumToInt(std.c.getErrno(-1));
+            const errno = @intFromEnum(std.c.getErrno(-1));
             if (errno != 0) {
                 err.put(globalObject, ZigString.static("errno"), JSValue.jsNumber(errno));
                 if (bun.C.SystemErrno.init(errno)) |str| {
@@ -496,6 +522,10 @@ pub const Listener = struct {
         };
 
         if (ssl_enabled) {
+            if (ssl.?.protos) |p| {
+                protos = p[0..ssl.?.protos_len];
+            }
+
             uws.NewSocketHandler(true).configure(
                 socket_context,
                 true,
@@ -544,7 +574,7 @@ pub const Listener = struct {
                     defer bun.default_allocator.free(host);
 
                     const socket = uws.us_socket_context_listen(
-                        @boolToInt(ssl_enabled),
+                        @intFromBool(ssl_enabled),
                         socket_context,
                         normalizeHost(@as([:0]const u8, host)),
                         c.port,
@@ -560,13 +590,13 @@ pub const Listener = struct {
                 .unix => |u| {
                     var host = bun.default_allocator.dupeZ(u8, u) catch unreachable;
                     defer bun.default_allocator.free(host);
-                    break :brk uws.us_socket_context_listen_unix(@boolToInt(ssl_enabled), socket_context, host, socket_flags, 8);
+                    break :brk uws.us_socket_context_listen_unix(@intFromBool(ssl_enabled), socket_context, host, socket_flags, 8);
                 },
             }
         } orelse {
             defer {
                 hostname_or_unix.deinit();
-                uws.us_socket_context_free(@boolToInt(ssl_enabled), socket_context);
+                uws.us_socket_context_free(@intFromBool(ssl_enabled), socket_context);
             }
 
             const err = globalObject.createErrorInstance(
@@ -575,7 +605,7 @@ pub const Listener = struct {
                     bun.span(hostname_or_unix.slice()),
                 },
             );
-            const errno = @enumToInt(std.c.getErrno(-1));
+            const errno = @intFromEnum(std.c.getErrno(-1));
             if (errno != 0) {
                 err.put(globalObject, ZigString.static("errno"), JSValue.jsNumber(errno));
                 if (bun.C.SystemErrno.init(errno)) |str| {
@@ -593,6 +623,7 @@ pub const Listener = struct {
             .ssl = ssl_enabled,
             .socket_context = socket_context,
             .listener = listen_socket,
+            .protos = if (protos) |p| (bun.default_allocator.dupe(u8, p) catch unreachable) else null,
         };
 
         socket.handlers.protect();
@@ -649,6 +680,8 @@ pub const Listener = struct {
             .handlers = &listener.handlers,
             .this_value = .zero,
             .socket = socket,
+            .protos = listener.protos,
+            .owned_protos = false,
         };
         if (listener.strong_data.get()) |default_data| {
             const globalObject = listener.handlers.globalObject;
@@ -715,6 +748,10 @@ pub const Listener = struct {
 
         this.handlers.unprotect();
         this.connection.deinit();
+        if (this.protos) |protos| {
+            this.protos = null;
+            bun.default_allocator.destroy(protos);
+        }
         bun.default_allocator.destroy(this);
     }
 
@@ -775,13 +812,17 @@ pub const Listener = struct {
         const socket_config = SocketConfig.fromJS(opts, globalObject, exception) orelse {
             return .zero;
         };
+
         var hostname_or_unix = socket_config.hostname_or_unix;
         var port = socket_config.port;
         var ssl = socket_config.ssl;
         var handlers = socket_config.handlers;
         var default_data = socket_config.default_data;
 
+        var protos: ?[]const u8 = null;
+        var server_name: ?[]const u8 = null;
         const ssl_enabled = ssl != null;
+        defer if (ssl != null) ssl.?.deinit();
 
         handlers.protect();
 
@@ -789,7 +830,7 @@ pub const Listener = struct {
 
         globalObject.bunVM().eventLoop().ensureWaker();
 
-        var socket_context = uws.us_create_bun_socket_context(@boolToInt(ssl_enabled), uws.Loop.get().?, @sizeOf(usize), ctx_opts).?;
+        var socket_context = uws.us_create_bun_socket_context(@intFromBool(ssl_enabled), uws.Loop.get().?, @sizeOf(usize), ctx_opts).?;
         var connection: Listener.UnixOrHost = if (port) |port_| .{
             .host = .{ .host = (hostname_or_unix.cloneIfNeeded(bun.default_allocator) catch unreachable).slice(), .port = port_ },
         } else .{
@@ -797,6 +838,12 @@ pub const Listener = struct {
         };
 
         if (ssl_enabled) {
+            if (ssl.?.protos) |p| {
+                protos = p[0..ssl.?.protos_len];
+            }
+            if (ssl.?.server_name) |s| {
+                server_name = bun.default_allocator.dupe(u8, s[0..bun.len(s)]) catch unreachable;
+            }
             uws.NewSocketHandler(true).configure(
                 socket_context,
                 true,
@@ -848,6 +895,8 @@ pub const Listener = struct {
                 .this_value = .zero,
                 .socket = undefined,
                 .connection = connection,
+                .protos = if (protos) |p| (bun.default_allocator.dupe(u8, p) catch unreachable) else null,
+                .server_name = server_name,
             };
 
             TLSSocket.dataSetCached(tls.getThisValue(globalObject), globalObject, default_data);
@@ -871,6 +920,8 @@ pub const Listener = struct {
                 .this_value = .zero,
                 .socket = undefined,
                 .connection = null,
+                .protos = null,
+                .server_name = null,
             };
 
             TCPSocket.dataSetCached(tcp.getThisValue(globalObject), globalObject, default_data);
@@ -898,11 +949,41 @@ fn JSSocketType(comptime ssl: bool) type {
     }
 }
 
+/// ALPN selection callback invoked by BoringSSL during a server-side handshake.
+/// `arg` is the `*TLSSocket` registered with `SSL_CTX_set_alpn_select_cb`; `in`/`inlen`
+/// carry the peer's offered protocol list and `out`/`outlen` receive the selection.
+fn selectALPNCallback(
+    _: ?*BoringSSL.SSL,
+    out: [*c][*c]const u8,
+    outlen: [*c]u8,
+    in: [*c]const u8,
+    inlen: c_uint,
+    arg: ?*anyopaque,
+) callconv(.C) c_int {
+    const this = bun.cast(*TLSSocket, arg);
+    // No (or empty) local protocol list: decline ALPN without failing the handshake.
+    const protos = this.protos orelse return BoringSSL.SSL_TLSEXT_ERR_NOACK;
+    if (protos.len == 0) return BoringSSL.SSL_TLSEXT_ERR_NOACK;
+
+    const status = BoringSSL.SSL_select_next_proto(bun.cast([*c][*c]u8, out), outlen, protos.ptr, @intCast(c_uint, protos.len), in, inlen);
+
+    // Previous versions of Node.js returned SSL_TLSEXT_ERR_NOACK if no protocol
+    // match was found. This would neither cause a fatal alert nor would it result
+    // in a useful ALPN response as part of the Server Hello message.
+    // We now return SSL_TLSEXT_ERR_ALERT_FATAL in that case as per Section 3.2
+    // of RFC 7301, which causes a fatal no_application_protocol alert.
+    return if (status == BoringSSL.OPENSSL_NPN_NEGOTIATED)
+        BoringSSL.SSL_TLSEXT_ERR_OK
+    else
+        BoringSSL.SSL_TLSEXT_ERR_ALERT_FATAL;
+}
+
 fn NewSocket(comptime ssl: bool) type {
     return struct {
         pub const Socket = uws.NewSocketHandler(ssl);
         socket: Socket,
         detached: bool = false,
+        wrapped: WrappedType = .none,
         handlers: *Handlers,
         this_value: JSC.JSValue = .zero,
         poll_ref: JSC.PollRef = JSC.PollRef.init(),
@@ -910,6 +991,9 @@ fn NewSocket(comptime ssl: bool) type {
         last_4: [4]u8 = .{ 0, 0, 0, 0 },
         authorized: bool = false,
         connection: ?Listener.UnixOrHost = null,
+        protos: ?[]const u8,
+        owned_protos: bool = true,
+        server_name: ?[]const u8 = null,
 
         // TODO: switch to something that uses `visitAggregate` and have the
         // `Listener` keep a list of all the sockets JSValue in there
@@ -1022,8 +1106,8 @@ fn NewSocket(comptime ssl: bool) type {
             var globalObject = handlers.globalObject;
             const err = JSC.SystemError{
                 .errno = errno,
-                .message = ZigString.init("Failed to connect"),
-                .syscall = ZigString.init("connect"),
+                .message = bun.String.static("Failed to connect"),
+                .syscall = bun.String.static("connect"),
             };
 
             if (callback == .zero) {
@@ -1079,7 +1163,7 @@ fn NewSocket(comptime ssl: bool) type {
                 var vm = this.handlers.vm;
                 this.reffer.unref(vm);
 
-                this.handlers.markInactive(ssl, this.socket.context());
+                this.handlers.markInactive(ssl, this.socket.context(), this.wrapped);
                 this.poll_ref.unref(vm);
                 this.has_pending_activity.store(false, .Release);
             }
@@ -1091,25 +1175,42 @@ fn NewSocket(comptime ssl: bool) type {
 
             // Add SNI support for TLS (mongodb and others requires this)
             if (comptime ssl) {
-                if (this.connection) |connection| {
-                    if (connection == .host) {
-                        const host = normalizeHost(connection.host.host);
+                var ssl_ptr: *BoringSSL.SSL = @ptrCast(*BoringSSL.SSL, socket.getNativeHandle());
+                if (!ssl_ptr.isInitFinished()) {
+                    if (this.server_name) |server_name| {
+                        const host = normalizeHost(server_name);
                         if (host.len > 0) {
-                            var ssl_ptr: *BoringSSL.SSL = @ptrCast(*BoringSSL.SSL, socket.getNativeHandle());
-                            if (!ssl_ptr.isInitFinished()) {
+                            var host__ = default_allocator.dupeZ(u8, host) catch unreachable;
+                            defer default_allocator.free(host__);
+                            ssl_ptr.setHostname(host__);
+                        }
+                    } else if (this.connection) |connection| {
+                        if (connection == .host) {
+                            const host = normalizeHost(connection.host.host);
+                            if (host.len > 0) {
                                 var host__ = default_allocator.dupeZ(u8, host) catch unreachable;
                                 defer default_allocator.free(host__);
                                 ssl_ptr.setHostname(host__);
                             }
                         }
                     }
+                    if (this.protos) |protos| {
+                        if (this.handlers.is_server) {
+                            BoringSSL.SSL_CTX_set_alpn_select_cb(BoringSSL.SSL_get_SSL_CTX(ssl_ptr), selectALPNCallback, bun.cast(*anyopaque, this));
+                        } else {
+                            _ = BoringSSL.SSL_set_alpn_protos(ssl_ptr, protos.ptr, @intCast(c_uint, protos.len));
+                        }
+                    }
                 }
             }
 
             this.poll_ref.ref(this.handlers.vm);
             this.detached = false;
             this.socket = socket;
-            socket.ext(**anyopaque).?.* = bun.cast(**anyopaque, this);
+
+            if (this.wrapped == .none) {
+                socket.ext(**anyopaque).?.* = bun.cast(**anyopaque, this);
+            }
 
             const handlers = this.handlers;
             const callback = handlers.onOpen;
@@ -1161,6 +1262,8 @@ fn NewSocket(comptime ssl: bool) type {
         pub fn onEnd(this: *This, socket: Socket) void {
             JSC.markBinding(@src());
             log("onEnd", .{});
+            if (this.detached) return;
+
             this.detached = true;
             defer this.markInactive();
 
@@ -1174,7 +1277,7 @@ fn NewSocket(comptime ssl: bool) type {
             // the handlers must be kept alive for the duration of the function call
             // that way if we need to call the error handler, we can
             var scope = handlers.enter(socket.context());
-            defer scope.exit(ssl);
+            defer scope.exit(ssl, this.wrapped);
 
             const globalObject = handlers.globalObject;
             const this_value = this.getThisValue(globalObject);
@@ -1211,7 +1314,7 @@ fn NewSocket(comptime ssl: bool) type {
             // the handlers must be kept alive for the duration of the function call
             // that way if we need to call the error handler, we can
             var scope = handlers.enter(socket.context());
-            defer scope.exit(ssl);
+            defer scope.exit(ssl, this.wrapped);
 
             const globalObject = handlers.globalObject;
             const this_value = this.getThisValue(globalObject);
@@ -1232,8 +1335,8 @@ fn NewSocket(comptime ssl: bool) type {
                     const reason = if (ssl_error.reason == null) "" else ssl_error.reason[0..bun.len(ssl_error.reason)];
 
                     const fallback = JSC.SystemError{
-                        .code = ZigString.init(code),
-                        .message = ZigString.init(reason),
+                        .code = bun.String.create(code),
+                        .message = bun.String.create(reason),
                     };
 
                     authorization_error = fallback.toErrorInstance(globalObject);
@@ -1255,7 +1358,6 @@ fn NewSocket(comptime ssl: bool) type {
             log("onClose", .{});
             this.detached = true;
             defer this.markInactive();
-
             const handlers = this.handlers;
             this.poll_ref.unref(handlers.vm);
 
@@ -1265,7 +1367,7 @@ fn NewSocket(comptime ssl: bool) type {
             // the handlers must be kept alive for the duration of the function call
             // that way if we need to call the error handler, we can
             var scope = handlers.enter(socket.context());
-            defer scope.exit(ssl);
+            defer scope.exit(ssl, this.wrapped);
 
             var globalObject = handlers.globalObject;
             const this_value = this.getThisValue(globalObject);
@@ -1295,7 +1397,7 @@ fn NewSocket(comptime ssl: bool) type {
             // the handlers must be kept alive for the duration of the function call
             // that way if we need to call the error handler, we can
             var scope = handlers.enter(socket.context());
-            defer scope.exit(ssl);
+            defer scope.exit(ssl, this.wrapped);
 
             // const encoding = handlers.encoding;
             const result = callback.callWithThis(globalObject, this_value, &[_]JSValue{
@@ -1409,8 +1511,8 @@ fn NewSocket(comptime ssl: bool) type {
             const reason = if (ssl_error.reason == null) "" else ssl_error.reason[0..bun.len(ssl_error.reason)];
 
             const fallback = JSC.SystemError{
-                .code = ZigString.init(code),
-                .message = ZigString.init(reason),
+                .code = bun.String.create(code),
+                .message = bun.String.create(reason),
             };
 
             return fallback.toErrorInstance(globalObject);
@@ -1476,10 +1578,20 @@ fn NewSocket(comptime ssl: bool) type {
         }
 
         fn writeMaybeCorked(this: *This, buffer: []const u8, is_end: bool) i32 {
-            if (this.socket.isShutdown() or this.socket.isClosed()) {
+            if (this.detached or this.socket.isShutdown() or this.socket.isClosed()) {
                 return -1;
             }
             // we don't cork yet but we might later
+
+            if (comptime ssl) {
+                // TLS wrapped but in TCP mode
+                if (this.wrapped == .tcp) {
+                    const res = this.socket.rawWrite(buffer, is_end);
+                    log("write({d}, {any}) = {d}", .{ buffer.len, is_end, res });
+                    return res;
+                }
+            }
+
             const res = this.socket.write(buffer, is_end);
             log("write({d}, {any}) = {d}", .{ buffer.len, is_end, res });
             return res;
@@ -1487,7 +1599,6 @@ fn NewSocket(comptime ssl: bool) type {
 
         fn writeOrEnd(this: *This, globalObject: *JSC.JSGlobalObject, args: []const JSC.JSValue, is_end: bool) WriteResult {
             if (args.len == 0) return .{ .success = .{} };
-
             if (args.ptr[0].asArrayBuffer(globalObject)) |array_buffer| {
                 var slice = array_buffer.slice();
 
@@ -1681,9 +1792,6 @@ fn NewSocket(comptime ssl: bool) type {
                     if (result.wrote == result.total) {
                         this.socket.flush();
                         this.detached = true;
-                        if (!this.socket.isClosed()) {
-                            this.socket.close(0, null);
-                        }
                         this.markInactive();
                     }
                     break :brk JSValue.jsNumber(result.wrote);
@@ -1706,17 +1814,32 @@ fn NewSocket(comptime ssl: bool) type {
 
         pub fn finalize(this: *This) callconv(.C) void {
             log("finalize()", .{});
-            if (this.detached) return;
-            this.detached = true;
-            if (!this.socket.isClosed()) {
-                this.socket.close(0, null);
+            if (!this.detached) {
+                this.detached = true;
+                if (!this.socket.isClosed()) {
+                    this.socket.close(0, null);
+                }
+                this.markInactive();
             }
+
+            this.poll_ref.unref(JSC.VirtualMachine.get());
+            // need to deinit even without being attached
+            if (this.owned_protos) {
+                if (this.protos) |protos| {
+                    this.protos = null;
+                    default_allocator.free(protos);
+                }
+            }
+
+            if (this.server_name) |server_name| {
+                this.server_name = null;
+                default_allocator.free(server_name);
+            }
+
             if (this.connection) |connection| {
-                connection.deinit();
                 this.connection = null;
+                connection.deinit();
             }
-            this.markInactive();
-            this.poll_ref.unref(JSC.VirtualMachine.get());
         }
 
         pub fn reload(this: *This, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSValue {
@@ -1756,8 +1879,384 @@ fn NewSocket(comptime ssl: bool) type {
 
             return JSValue.jsUndefined();
         }
+
+        /// Reports the ALPN protocol selected for this connection as a JS string.
+        /// Yields JS `false` on plain TCP, on a detached socket, or when none was selected.
+        pub fn getALPNProtocol(
+            this: *This,
+            globalObject: *JSC.JSGlobalObject,
+        ) callconv(.C) JSValue {
+            // non-TLS instantiations always answer `false`
+            if (comptime !ssl) return JSValue.jsBoolean(false);
+            if (this.detached) return JSValue.jsBoolean(false);
+
+            const ssl_handle: *BoringSSL.SSL = @ptrCast(*BoringSSL.SSL, this.socket.getNativeHandle());
+
+            // out-parameters filled in by SSL_get0_alpn_selected
+            var selected_ptr: [*c]const u8 = null;
+            var selected_len: u32 = 0;
+            BoringSSL.SSL_get0_alpn_selected(ssl_handle, &selected_ptr, &selected_len);
+
+            const nothing_selected = selected_ptr == null or selected_len == 0;
+            if (nothing_selected) return JSValue.jsBoolean(false);
+
+            const protocol = selected_ptr[0..selected_len];
+
+            // the two well-known names go through ZigString.static; anything else is converted with toValueGC
+            if (strings.eql(protocol, "http/1.1")) {
+                return ZigString.static("http/1.1").toValue(globalObject);
+            } else if (strings.eql(protocol, "h2")) {
+                return ZigString.static("h2").toValue(globalObject);
+            }
+            return ZigString.fromUTF8(protocol).toValueGC(globalObject);
+        }
+
+        /// Stores the SNI server name for a client-side TLS socket and, when the socket
+        /// is attached and the handshake has not finished, applies it to the SSL handle.
+        pub fn setServername(
+            this: *This,
+            globalObject: *JSC.JSGlobalObject,
+            callframe: *JSC.CallFrame,
+        ) callconv(.C) JSValue {
+            // no-op for plain TCP sockets
+            if (comptime !ssl) return JSValue.jsUndefined();
+
+            if (this.handlers.is_server) {
+                globalObject.throw("Cannot issue SNI from a TLS server-side socket", .{});
+                return .zero;
+            }
+
+            const arguments = callframe.arguments(1);
+            if (arguments.len < 1) {
+                globalObject.throw("Expected 1 argument", .{});
+                return .zero;
+            }
+
+            const name_value = arguments.ptr[0];
+            if (!name_value.isString()) {
+                globalObject.throw("Expected \"serverName\" to be a string", .{});
+                return .zero;
+            }
+
+            // take an owned copy of the name and swap it in, releasing any previous one
+            const owned_name = name_value.getZigString(globalObject).toOwnedSlice(bun.default_allocator) catch unreachable;
+            const previous_name = this.server_name;
+            this.server_name = owned_name;
+            if (previous_name) |old| {
+                default_allocator.free(old);
+            }
+
+            // will be attached onOpen
+            if (this.detached) return JSValue.jsUndefined();
+
+            const host = normalizeHost(@as([]const u8, owned_name));
+            if (host.len == 0) return JSValue.jsUndefined();
+
+            const ssl_handle: *BoringSSL.SSL = @ptrCast(*BoringSSL.SSL, this.socket.getNativeHandle());
+            if (ssl_handle.isInitFinished()) {
+                // match node.js exceptions
+                globalObject.throw("Already started.", .{});
+                return .zero;
+            }
+
+            // setHostname gets a temporary NUL-terminated copy, freed when this call returns
+            const host_z = default_allocator.dupeZ(u8, host) catch unreachable;
+            defer default_allocator.free(host_z);
+            ssl_handle.setHostname(host_z);
+            return JSValue.jsUndefined();
+        }
+
+        /// Upgrades this plain TCP socket to TLS and returns the JS array `[raw, tls]`: two new
+        /// wrappers sharing one socket. `raw` takes over this socket's handlers, `tls` uses the
+        /// handlers from `opts.socket`. Returns undefined for TLS/detached sockets or if wrapping fails.
+        pub fn upgradeTLS(
+            this: *This,
+            globalObject: *JSC.JSGlobalObject,
+            callframe: *JSC.CallFrame,
+        ) callconv(.C) JSValue {
+            JSC.markBinding(@src());
+            if (comptime ssl) {
+                return JSValue.jsUndefined();
+            }
+
+            if (this.detached) {
+                return JSValue.jsUndefined();
+            }
+
+            const args = callframe.arguments(1);
+            if (args.len < 1) {
+                globalObject.throw("Expected 1 arguments", .{});
+                return .zero;
+            }
+
+            var exception: JSC.C.JSValueRef = null;
+            const opts = args.ptr[0];
+            if (opts.isEmptyOrUndefinedOrNull() or opts.isBoolean() or !opts.isObject()) {
+                globalObject.throw("Expected options object", .{});
+                return .zero;
+            }
+
+            var socket_obj = opts.get(globalObject, "socket") orelse {
+                globalObject.throw("Expected \"socket\" option", .{});
+                return .zero;
+            };
+
+            var handlers = Handlers.fromJS(globalObject, socket_obj, &exception) orelse {
+                globalObject.throwValue(exception.?.value());
+                return .zero;
+            };
+
+            var ssl_opts: ?JSC.API.ServerConfig.SSLConfig = null;
+
+            if (opts.getTruthy(globalObject, "tls")) |tls| {
+                if (tls.isBoolean()) {
+                    if (tls.toBoolean()) {
+                        ssl_opts = JSC.API.ServerConfig.SSLConfig.zero;
+                    }
+                } else {
+                    if (JSC.API.ServerConfig.SSLConfig.inJS(globalObject, tls, &exception)) |ssl_config| {
+                        ssl_opts = ssl_config;
+                    } else if (exception != null) {
+                        // surface the parse error to JS, same as the Handlers.fromJS failure above
+                        globalObject.throwValue(exception.?.value());
+                        return .zero;
+                    }
+                }
+            }
+
+            if (ssl_opts == null) {
+                globalObject.throw("Expected \"tls\" option", .{});
+                return .zero;
+            }
+
+            var default_data = JSValue.zero;
+            if (opts.getTruthy(globalObject, "data")) |default_data_value| {
+                default_data = default_data_value;
+                default_data.ensureStillAlive();
+            }
+
+            var socket_config = ssl_opts.?;
+            defer socket_config.deinit();
+            const options = socket_config.asUSockets();
+
+            const protos = socket_config.protos;
+            const protos_len = socket_config.protos_len;
+
+            const ext_size = @sizeOf(WrappedSocket);
+
+            const is_server = this.handlers.is_server;
+            var tls = handlers.vm.allocator.create(TLSSocket) catch @panic("OOM");
+            var handlers_ptr = handlers.vm.allocator.create(Handlers) catch @panic("OOM");
+            handlers_ptr.* = handlers;
+            handlers_ptr.is_server = is_server;
+            handlers_ptr.protect();
+
+            tls.* = .{
+                .handlers = handlers_ptr,
+                .this_value = .zero,
+                .socket = undefined,
+                .connection = if (this.connection) |c| c.clone() else null,
+                .wrapped = .tls,
+                .protos = if (protos) |p| (bun.default_allocator.dupe(u8, p[0..protos_len]) catch unreachable) else null,
+                .server_name = if (socket_config.server_name) |server_name| (bun.default_allocator.dupe(u8, server_name[0..bun.len(server_name)]) catch unreachable) else null,
+            };
+
+            var tls_js_value = tls.getThisValue(globalObject);
+            TLSSocket.dataSetCached(tls_js_value, globalObject, default_data);
+
+            const TCPHandler = NewWrappedHandler(false);
+            // reconfigure context to use the new wrapper handlers
+            Socket.unsafeConfigure(this.socket.context(), true, true, WrappedSocket, TCPHandler);
+            const old_context = this.socket.context();
+            const TLSHandler = NewWrappedHandler(true);
+            const new_socket = this.socket.wrapTLS(
+                options,
+                ext_size,
+                true,
+                WrappedSocket,
+                TLSHandler,
+            ) orelse {
+                // NOTE(review): tls_js_value (created above) still wraps `tls` — confirm freeing it here is safe
+                handlers_ptr.unprotect();
+                handlers.vm.allocator.destroy(handlers_ptr);
+                handlers.vm.allocator.destroy(tls); // same allocator that created `tls` above
+                return JSValue.jsUndefined();
+            };
+
+            tls.socket = new_socket;
+
+            var raw = handlers.vm.allocator.create(TLSSocket) catch @panic("OOM");
+            var raw_handlers_ptr = handlers.vm.allocator.create(Handlers) catch @panic("OOM");
+            raw_handlers_ptr.* = .{
+                .vm = globalObject.bunVM(),
+                .globalObject = globalObject,
+                .onOpen = this.handlers.onOpen,
+                .onClose = this.handlers.onClose,
+                .onData = this.handlers.onData,
+                .onWritable = this.handlers.onWritable,
+                .onTimeout = this.handlers.onTimeout,
+                .onConnectError = this.handlers.onConnectError,
+                .onEnd = this.handlers.onEnd,
+                .onError = this.handlers.onError,
+                .onHandshake = this.handlers.onHandshake,
+                .binary_type = this.handlers.binary_type,
+                .is_server = is_server,
+            };
+            this.handlers.onOpen = .zero; // the callbacks were copied into raw_handlers_ptr; clear them on the old handlers
+            this.handlers.onClose = .zero;
+            this.handlers.onData = .zero;
+            this.handlers.onWritable = .zero;
+            this.handlers.onTimeout = .zero;
+            this.handlers.onConnectError = .zero;
+            this.handlers.onEnd = .zero;
+            this.handlers.onError = .zero;
+            this.handlers.onHandshake = .zero;
+            raw.* = .{
+                .handlers = raw_handlers_ptr,
+                .this_value = .zero,
+                .socket = new_socket,
+                .connection = if (this.connection) |c| c.clone() else null,
+                .wrapped = .tcp,
+                .protos = null,
+            };
+
+            var raw_js_value = raw.getThisValue(globalObject);
+            if (JSSocketType(ssl).dataGetCached(this.getThisValue(globalObject))) |raw_default_data| {
+                raw_default_data.ensureStillAlive();
+                TLSSocket.dataSetCached(raw_js_value, globalObject, raw_default_data);
+            }
+            // marks both as active
+            raw.markActive();
+            // this will keep tls alive until socket.open() is called to start TLS certificate and the handshake process
+            // open is not immediately called because we need to set bunSocketInternal
+            tls.markActive();
+
+            // mark both instances on socket data
+            new_socket.ext(WrappedSocket).?.* = .{ .tcp = raw, .tls = tls };
+
+            // start TLS handshake after we set ext
+            new_socket.startTLS(!this.handlers.is_server);
+
+            // detach and invalidate the old instance
+            this.detached = true;
+            if (this.reffer.has) {
+                var vm = this.handlers.vm;
+                this.reffer.unref(vm);
+                old_context.deinit(ssl);
+                bun.default_allocator.destroy(this.handlers);
+                this.poll_ref.unref(vm);
+                this.has_pending_activity.store(false, .Release);
+            }
+
+            const array = JSC.JSValue.createEmptyArray(globalObject, 2);
+            array.putIndex(globalObject, 0, raw_js_value);
+            array.putIndex(globalObject, 1, tls_js_value);
+            return array;
+        }
     };
 }
 
 pub const TCPSocket = NewSocket(false);
 pub const TLSSocket = NewSocket(true);
+
+pub const WrappedSocket = extern struct {
+    // Pair of wrapper instances passed to the NewWrappedHandler callbacks: one behaves as TLS and the other as TCP.
+    tls: *TLSSocket, // target of NewWrappedHandler(true)
+    tcp: *TLSSocket, // target of NewWrappedHandler(false); note it is also typed *TLSSocket
+};
+
+/// Builds a namespace of socket event callbacks that take a `WrappedSocket`.
+///
+/// `NewWrappedHandler(true)` forwards every event to the `tls` wrapper.
+/// `NewWrappedHandler(false)` forwards events to the `tcp` wrapper and ignores
+/// `onOpen` and `onHandshake`.
+pub fn NewWrappedHandler(comptime tls: bool) type {
+    const Socket = uws.NewSocketHandler(true);
+    return struct {
+        /// Open notification. Forwarded to `this.tls` when `tls` is true;
+        /// does nothing otherwise.
+        pub fn onOpen(
+            this: WrappedSocket,
+            socket: Socket,
+        ) void {
+            // only TLS will call onOpen
+            if (comptime tls) TLSSocket.onOpen(this.tls, socket);
+        }
+
+        /// End-of-stream notification, forwarded to the wrapper selected
+        /// by `tls`.
+        pub fn onEnd(
+            this: WrappedSocket,
+            socket: Socket,
+        ) void {
+            const target = if (comptime tls) this.tls else this.tcp;
+            TLSSocket.onEnd(target, socket);
+        }
+
+        /// Handshake result. Forwarded to `this.tls` when `tls` is true;
+        /// does nothing otherwise.
+        pub fn onHandshake(
+            this: WrappedSocket,
+            socket: Socket,
+            success: i32,
+            ssl_error: uws.us_bun_verify_error_t,
+        ) void {
+            // only TLS will call onHandshake
+            if (comptime tls) TLSSocket.onHandshake(this.tls, socket, success, ssl_error);
+        }
+
+        /// Close notification, forwarded with `err` and `data` to the
+        /// wrapper selected by `tls`.
+        pub fn onClose(
+            this: WrappedSocket,
+            socket: Socket,
+            err: c_int,
+            data: ?*anyopaque,
+        ) void {
+            const target = if (comptime tls) this.tls else this.tcp;
+            TLSSocket.onClose(target, socket, err, data);
+        }
+
+        /// Incoming bytes, forwarded unchanged to the wrapper selected
+        /// by `tls`.
+        pub fn onData(
+            this: WrappedSocket,
+            socket: Socket,
+            data: []const u8,
+        ) void {
+            const target = if (comptime tls) this.tls else this.tcp;
+            TLSSocket.onData(target, socket, data);
+        }
+
+        /// Writable notification, forwarded to the wrapper selected
+        /// by `tls`.
+        pub fn onWritable(
+            this: WrappedSocket,
+            socket: Socket,
+        ) void {
+            const target = if (comptime tls) this.tls else this.tcp;
+            TLSSocket.onWritable(target, socket);
+        }
+
+        /// Timeout notification, forwarded to the wrapper selected
+        /// by `tls`.
+        pub fn onTimeout(
+            this: WrappedSocket,
+            socket: Socket,
+        ) void {
+            const target = if (comptime tls) this.tls else this.tcp;
+            TLSSocket.onTimeout(target, socket);
+        }
+
+        /// Connect failure, forwarded with `errno` to the wrapper selected
+        /// by `tls`.
+        pub fn onConnectError(
+            this: WrappedSocket,
+            socket: Socket,
+            errno: c_int,
+        ) void {
+            const target = if (comptime tls) this.tls else this.tcp;
+            TLSSocket.onConnectError(target, socket, errno);
+        }
+    };
+}
diff --git a/src/bun.js/api/bun/subprocess.zig b/src/bun.js/api/bun/subprocess.zig
index 832afac78..ba813c463 100644
--- a/src/bun.js/api/bun/subprocess.zig
+++ b/src/bun.js/api/bun/subprocess.zig
@@ -1011,7 +1011,7 @@ pub const Subprocess = struct {
             if (signal.name()) |name|
                 return JSC.ZigString.init(name).toValueGC(global)
             else
-                return JSC.JSValue.jsNumber(@enumToInt(signal));
+                return JSC.JSValue.jsNumber(@intFromEnum(signal));
         }
 
         return JSC.JSValue.jsNull();
@@ -1535,9 +1535,9 @@ pub const Subprocess = struct {
                 }
 
                 if (std.os.W.IFSIGNALED(result.status)) {
-                    this.signal_code = @intToEnum(SignalCode, @truncate(u8, std.os.W.TERMSIG(result.status)));
+                    this.signal_code = @enumFromInt(SignalCode, @truncate(u8, std.os.W.TERMSIG(result.status)));
                 } else if (std.os.W.IFSTOPPED(result.status)) {
-                    this.signal_code = @intToEnum(SignalCode, @truncate(u8, std.os.W.STOPSIG(result.status)));
+                    this.signal_code = @enumFromInt(SignalCode, @truncate(u8, std.os.W.STOPSIG(result.status)));
                 }
 
                 if (!this.hasExited()) {
diff --git a/src/bun.js/api/ffi.zig b/src/bun.js/api/ffi.zig
index fe2b50955..ba31b67ed 100644
--- a/src/bun.js/api/ffi.zig
+++ b/src/bun.js/api/ffi.zig
@@ -137,8 +137,8 @@ pub const FFI = struct {
                     globalThis,
                     ZigString.static("ptr"),
                     ZigString.static("ctx"),
-                    JSC.JSValue.fromPtrAddress(@ptrToInt(function_.step.compiled.ptr)),
-                    JSC.JSValue.fromPtrAddress(@ptrToInt(function_)),
+                    JSC.JSValue.fromPtrAddress(@intFromPtr(function_.step.compiled.ptr)),
+                    JSC.JSValue.fromPtrAddress(@intFromPtr(function_)),
                 );
             },
         }
@@ -311,9 +311,9 @@ pub const FFI = struct {
                 break :brk std.DynLib.open(backup_name) catch {
                     // Then, if that fails, report an error.
                     const system_error = JSC.SystemError{
-                        .code = ZigString.init(@tagName(JSC.Node.ErrorCode.ERR_DLOPEN_FAILED)),
-                        .message = ZigString.init("Failed to open library. This is usually caused by a missing library or an invalid library path."),
-                        .syscall = ZigString.init("dlopen"),
+                        .code = bun.String.create(@tagName(JSC.Node.ErrorCode.ERR_DLOPEN_FAILED)),
+                        .message = bun.String.create("Failed to open library. This is usually caused by a missing library or an invalid library path."),
+                        .syscall = bun.String.create("dlopen"),
                     };
                     return system_error.toErrorInstance(global);
                 };
@@ -523,7 +523,7 @@ pub const FFI = struct {
                     const int = val.to(i32);
                     switch (int) {
                         0...ABIType.max => {
-                            abi_types.appendAssumeCapacity(@intToEnum(ABIType, int));
+                            abi_types.appendAssumeCapacity(@enumFromInt(ABIType, int));
                             continue;
                         },
                         else => {
@@ -560,7 +560,7 @@ pub const FFI = struct {
                 const int = ret_value.toInt32();
                 switch (int) {
                     0...ABIType.max => {
-                        return_type = @intToEnum(ABIType, int);
+                        return_type = @enumFromInt(ABIType, int);
                         break :brk;
                     },
                     else => {
@@ -594,11 +594,11 @@ pub const FFI = struct {
             if (ptr.isNumber()) {
                 const num = ptr.asPtrAddress();
                 if (num > 0)
-                    function.symbol_from_dynamic_library = @intToPtr(*anyopaque, num);
+                    function.symbol_from_dynamic_library = @ptrFromInt(*anyopaque, num);
             } else {
                 const num = ptr.toUInt64NoTruncate();
                 if (num > 0) {
-                    function.symbol_from_dynamic_library = @intToPtr(*anyopaque, num);
+                    function.symbol_from_dynamic_library = @ptrFromInt(*anyopaque, num);
                 }
             }
         }
@@ -866,7 +866,7 @@ pub const FFI = struct {
                 c: u8,
                 byte_count: usize,
             ) callconv(.C) void {
-                @memset(dest, c, byte_count);
+                @memset(dest[0..byte_count], c);
             }
 
             noinline fn memcpy(
@@ -874,7 +874,7 @@ pub const FFI = struct {
                 noalias source: [*]const u8,
                 byte_count: usize,
             ) callconv(.C) void {
-                @memcpy(dest, source, byte_count);
+                @memcpy(dest[0..byte_count], source[0..byte_count]);
             }
 
             pub fn define(state: *TCC.TCCState) void {
@@ -1205,7 +1205,7 @@ pub const FFI = struct {
             writer: anytype,
         ) !void {
             {
-                const ptr = @ptrToInt(globalObject);
+                const ptr = @intFromPtr(globalObject);
                 const fmt = bun.fmt.hexIntUpper(ptr);
                 try writer.print("#define JS_GLOBAL_OBJECT (void*)0x{any}ULL\n", .{fmt});
             }
@@ -1290,7 +1290,7 @@ pub const FFI = struct {
             var inner_buf: []u8 = &.{};
 
             {
-                const ptr = @ptrToInt(context_ptr);
+                const ptr = @intFromPtr(context_ptr);
                 const fmt = bun.fmt.hexIntUpper(ptr);
 
                 if (this.arg_types.items.len > 0) {
@@ -1355,7 +1355,7 @@ pub const FFI = struct {
 
         function = 17,
 
-        pub const max = @enumToInt(ABIType.function);
+        pub const max = @intFromEnum(ABIType.function);
 
         /// Types that we can directly pass through as an `int64_t`
         pub fn needsACastInC(this: ABIType) bool {
@@ -1414,11 +1414,11 @@ pub const FFI = struct {
                 // these are not all valid identifiers
                 try writer.writeAll(self.name);
                 try writer.writeAll("']:");
-                try std.fmt.formatInt(@enumToInt(self.entry), 10, .lower, .{}, writer);
+                try std.fmt.formatInt(@intFromEnum(self.entry), 10, .lower, .{}, writer);
                 try writer.writeAll(",'");
-                try std.fmt.formatInt(@enumToInt(self.entry), 10, .lower, .{}, writer);
+                try std.fmt.formatInt(@intFromEnum(self.entry), 10, .lower, .{}, writer);
                 try writer.writeAll("':");
-                try std.fmt.formatInt(@enumToInt(self.entry), 10, .lower, .{}, writer);
+                try std.fmt.formatInt(@intFromEnum(self.entry), 10, .lower, .{}, writer);
             }
         };
         pub const map_to_js_object = brk: {
@@ -1426,7 +1426,7 @@ pub const FFI = struct {
             for (map, 0..) |item, i| {
                 var fmt = EnumMapFormatter{ .name = item.@"0", .entry = item.@"1" };
                 count += std.fmt.count("{}", .{fmt});
-                count += @boolToInt(i > 0);
+                count += @intFromBool(i > 0);
             }
 
             var buf: [count]u8 = undefined;
diff --git a/src/bun.js/api/html_rewriter.zig b/src/bun.js/api/html_rewriter.zig
index bfbdb9a37..b309e07d7 100644
--- a/src/bun.js/api/html_rewriter.zig
+++ b/src/bun.js/api/html_rewriter.zig
@@ -106,7 +106,7 @@ pub const HTMLRewriter = struct {
 
         var selector = LOLHTML.HTMLSelector.parse(selector_slice) catch
             return throwLOLHTMLError(global);
-        var handler_ = ElementHandler.init(global, listener, exception);
+        var handler_ = ElementHandler.init(global, listener, exception) catch return .zero;
         if (exception.* != null) {
             selector.deinit();
             return JSValue.fromRef(exception.*);
@@ -154,7 +154,7 @@ pub const HTMLRewriter = struct {
         thisObject: JSC.C.JSObjectRef,
         exception: JSC.C.ExceptionRef,
     ) JSValue {
-        var handler_ = DocumentHandler.init(global, listener, exception);
+        var handler_ = DocumentHandler.init(global, listener, exception) catch return .zero;
         if (exception.* != null) {
             return JSValue.fromRef(exception.*);
         }
@@ -446,10 +446,14 @@ pub const HTMLRewriter = struct {
                 },
             };
 
-            result.body.init.headers = original.body.init.headers;
             result.body.init.method = original.body.init.method;
             result.body.init.status_code = original.body.init.status_code;
 
+            // https://github.com/oven-sh/bun/issues/3334
+            if (original.body.init.headers) |headers| {
+                result.body.init.headers = headers.cloneThis(global);
+            }
+
             result.url = bun.default_allocator.dupe(u8, original.url) catch unreachable;
             result.status_text = bun.default_allocator.dupe(u8, original.status_text) catch unreachable;
 
@@ -472,13 +476,13 @@ pub const HTMLRewriter = struct {
         pub fn onFinishedLoading(sink: *BufferOutputSink, bytes: JSC.WebCore.Blob.Store.ReadFile.ResultType) void {
             switch (bytes) {
                 .err => |err| {
-                    if (sink.response.body.value == .Locked and @ptrToInt(sink.response.body.value.Locked.task) == @ptrToInt(sink) and
+                    if (sink.response.body.value == .Locked and @intFromPtr(sink.response.body.value.Locked.task) == @intFromPtr(sink) and
                         sink.response.body.value.Locked.promise == null)
                     {
                         sink.response.body.value = .{ .Empty = {} };
                         // is there a pending promise?
                         // we will need to reject it
-                    } else if (sink.response.body.value == .Locked and @ptrToInt(sink.response.body.value.Locked.task) == @ptrToInt(sink) and
+                    } else if (sink.response.body.value == .Locked and @intFromPtr(sink.response.body.value.Locked.task) == @intFromPtr(sink) and
                         sink.response.body.value.Locked.promise != null)
                     {
                         sink.response.body.value.Locked.onReceiveValue = null;
@@ -723,29 +727,44 @@ const DocumentHandler = struct {
         "onEndCallback",
     );
 
-    pub fn init(global: *JSGlobalObject, thisObject: JSValue, exception: JSC.C.ExceptionRef) DocumentHandler {
+    pub fn init(global: *JSGlobalObject, thisObject: JSValue, exception: JSC.C.ExceptionRef) !DocumentHandler {
         var handler = DocumentHandler{
             .thisObject = thisObject,
             .global = global,
         };
 
-        switch (thisObject.jsType()) {
-            .Object, .ProxyObject, .Cell, .FinalObject => {},
-            else => |kind| {
-                JSC.throwInvalidArguments(
-                    "Expected object but received {s}",
-                    .{@as(string, @tagName(kind))},
-                    global,
-                    exception,
-                );
-                return undefined;
-            },
+        if (!thisObject.isObject()) {
+            JSC.throwInvalidArguments(
+                "Expected object",
+                .{},
+                global,
+                exception,
+            );
+            return error.InvalidArguments;
+        }
+
+        errdefer {
+            if (handler.onDocTypeCallback) |cb| {
+                cb.unprotect();
+            }
+
+            if (handler.onCommentCallback) |cb| {
+                cb.unprotect();
+            }
+
+            if (handler.onTextCallback) |cb| {
+                cb.unprotect();
+            }
+
+            if (handler.onEndCallback) |cb| {
+                cb.unprotect();
+            }
         }
 
         if (thisObject.get(global, "doctype")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("doctype must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onDocTypeCallback = val;
@@ -754,7 +773,7 @@ const DocumentHandler = struct {
         if (thisObject.get(global, "comments")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("comments must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onCommentCallback = val;
@@ -763,7 +782,7 @@ const DocumentHandler = struct {
         if (thisObject.get(global, "text")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("text must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onTextCallback = val;
@@ -772,7 +791,7 @@ const DocumentHandler = struct {
         if (thisObject.get(global, "end")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("end must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onEndCallback = val;
@@ -863,29 +882,39 @@ const ElementHandler = struct {
     global: *JSGlobalObject,
     ctx: ?*HTMLRewriter.BufferOutputSink = null,
 
-    pub fn init(global: *JSGlobalObject, thisObject: JSValue, exception: JSC.C.ExceptionRef) ElementHandler {
+    pub fn init(global: *JSGlobalObject, thisObject: JSValue, exception: JSC.C.ExceptionRef) !ElementHandler {
         var handler = ElementHandler{
             .thisObject = thisObject,
             .global = global,
         };
+        errdefer {
+            if (handler.onCommentCallback) |cb| {
+                cb.unprotect();
+            }
 
-        switch (thisObject.jsType()) {
-            .Object, .ProxyObject, .Cell, .FinalObject => {},
-            else => |kind| {
-                JSC.throwInvalidArguments(
-                    "Expected object but received {s}",
-                    .{@as(string, @tagName(kind))},
-                    global,
-                    exception,
-                );
-                return undefined;
-            },
+            if (handler.onElementCallback) |cb| {
+                cb.unprotect();
+            }
+
+            if (handler.onTextCallback) |cb| {
+                cb.unprotect();
+            }
+        }
+
+        if (!thisObject.isObject()) {
+            JSC.throwInvalidArguments(
+                "Expected object",
+                .{},
+                global,
+                exception,
+            );
+            return error.InvalidArguments;
         }
 
         if (thisObject.get(global, "element")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("element must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onElementCallback = val;
@@ -894,7 +923,7 @@ const ElementHandler = struct {
         if (thisObject.get(global, "comments")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("comments must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onCommentCallback = val;
@@ -903,7 +932,7 @@ const ElementHandler = struct {
         if (thisObject.get(global, "text")) |val| {
             if (val.isUndefinedOrNull() or !val.isCell() or !val.isCallable(global.vm())) {
                 JSC.throwInvalidArguments("text must be a function", .{}, global, exception);
-                return undefined;
+                return error.InvalidArguments;
             }
             JSC.C.JSValueProtect(global, val.asObjectRef());
             handler.onTextCallback = val;
@@ -967,26 +996,14 @@ const getterWrap = JSC.getterWrap;
 const setterWrap = JSC.setterWrap;
 const wrap = JSC.wrapSync;
 
-pub fn free_html_writer_string(_: ?*anyopaque, ptr: ?*anyopaque, len: usize) callconv(.C) void {
-    var str = LOLHTML.HTMLString{ .ptr = bun.cast([*]const u8, ptr.?), .len = len };
-    str.deinit();
-}
-
 fn throwLOLHTMLError(global: *JSGlobalObject) JSValue {
-    var err = LOLHTML.HTMLString.lastError();
-    return ZigString.init(err.slice()).toErrorInstance(global);
+    const err = LOLHTML.HTMLString.lastError(); // message of the last error reported by LOLHTML
+    defer err.deinit(); // NOTE(review): assumes toErrorInstance copies the bytes before this runs — confirm
+    return ZigString.fromUTF8(err.slice()).toErrorInstance(global);
 }
 
 fn htmlStringValue(input: LOLHTML.HTMLString, globalObject: *JSGlobalObject) JSValue {
-    var str = ZigString.init(
-        input.slice(),
-    );
-    str.detectEncoding();
-
-    return str.toExternalValueWithCallback(
-        globalObject,
-        free_html_writer_string,
-    );
+    return input.toJS(globalObject); // conversion (and presumably ownership of `input`) is delegated to HTMLString.toJS — confirm
 }
 
 pub const TextChunk = struct {
@@ -1016,6 +1033,9 @@ pub const TextChunk = struct {
             .removed = .{
                 .get = getterWrap(TextChunk, "removed"),
             },
+            .lastInTextNode = .{
+                .get = getterWrap(TextChunk, "lastInTextNode"),
+            },
             .text = .{
                 .get = getterWrap(TextChunk, "getText"),
             },
@@ -1084,6 +1104,10 @@ pub const TextChunk = struct {
         return JSC.JSValue.jsBoolean(this.text_chunk.?.isRemoved());
     }
 
+    pub fn lastInTextNode(this: *TextChunk, _: *JSGlobalObject) JSValue {
+        return JSC.JSValue.jsBoolean(this.text_chunk.?.isLastInTextNode());
+    }
+
     pub fn finalize(this: *TextChunk) void {
         this.text_chunk = null;
         bun.default_allocator.destroy(this);
@@ -1292,7 +1316,7 @@ pub const Comment = struct {
     pub fn getText(this: *Comment, global: *JSGlobalObject) JSValue {
         if (this.comment == null)
             return JSValue.jsNull();
-        return ZigString.init(this.comment.?.getText().slice()).withEncoding().toValueGC(global);
+        return this.comment.?.getText().toJS(global);
     }
 
     pub fn setText(
@@ -1422,7 +1446,7 @@ pub const EndTag = struct {
         if (this.end_tag == null)
             return JSC.JSValue.jsUndefined();
 
-        return ZigString.init(this.end_tag.?.getName().slice()).withEncoding().toValueGC(global);
+        return this.end_tag.?.getName().toJS(global);
     }
 
     pub fn setName(
@@ -1534,27 +1558,16 @@ pub const AttributeIterator = struct {
             return JSC.JSValue.jsNull();
         };
 
-        // TODO: don't clone here
         const value = attribute.value();
         const name = attribute.name();
-        defer name.deinit();
-        defer value.deinit();
 
-        var strs = [2]ZigString{
-            ZigString.init(name.slice()),
-            ZigString.init(value.slice()),
-        };
-
-        var valid_strs: []ZigString = strs[0..2];
-
-        var array = JSC.JSValue.createStringArray(
+        return bun.String.toJSArray(
             globalObject,
-            valid_strs.ptr,
-            valid_strs.len,
-            true,
+            &[_]bun.String{
+                name.toString(),
+                value.toString(),
+            },
         );
-
-        return array;
     }
 };
 pub const Element = struct {
@@ -1660,19 +1673,12 @@ pub const Element = struct {
 
         var slice = name.toSlice(bun.default_allocator);
         defer slice.deinit();
-        var attr = this.element.?.getAttribute(slice.slice()).slice();
+        var attr = this.element.?.getAttribute(slice.slice());
 
         if (attr.len == 0)
             return JSC.JSValue.jsNull();
 
-        var str = ZigString.init(
-            attr,
-        );
-
-        return str.toExternalValueWithCallback(
-            globalObject,
-            free_html_writer_string,
-        );
+        return attr.toJS(globalObject);
     }
 
     /// Returns a boolean indicating whether an attribute exists on the element.
@@ -1847,8 +1853,9 @@ pub const Element = struct {
     pub fn getNamespaceURI(this: *Element, globalObject: *JSGlobalObject) JSValue {
         if (this.element == null)
             return JSValue.jsUndefined();
-
-        return ZigString.init(std.mem.span(this.element.?.namespaceURI())).toValueGC(globalObject);
+        var str = bun.String.create(std.mem.span(this.element.?.namespaceURI()));
+        defer str.deref();
+        return str.toJS(globalObject);
     }
 
     pub fn getAttributes(this: *Element, globalObject: *JSGlobalObject) JSValue {
diff --git a/src/bun.js/api/server.zig b/src/bun.js/api/server.zig
index 37bc601a5..9625ff693 100644
--- a/src/bun.js/api/server.zig
+++ b/src/bun.js/api/server.zig
@@ -163,12 +163,14 @@ pub const ServerConfig = struct {
         request_cert: i32 = 0,
         reject_unauthorized: i32 = 0,
         ssl_ciphers: [*c]const u8 = null,
+        protos: [*c]const u8 = null,
+        protos_len: usize = 0,
 
         const log = Output.scoped(.SSLConfig, false);
 
         pub fn asUSockets(this_: ?SSLConfig) uws.us_bun_socket_context_options_t {
             var ctx_opts: uws.us_bun_socket_context_options_t = undefined;
-            @memset(@ptrCast([*]u8, &ctx_opts), 0, @sizeOf(uws.us_bun_socket_context_options_t));
+            @memset(@ptrCast([*]u8, &ctx_opts)[0..@sizeOf(uws.us_bun_socket_context_options_t)], 0);
 
             if (this_) |ssl_config| {
                 if (ssl_config.key_file_name != null)
@@ -181,7 +183,7 @@ pub const ServerConfig = struct {
                     ctx_opts.dh_params_file_name = ssl_config.dh_params_file_name;
                 if (ssl_config.passphrase != null)
                     ctx_opts.passphrase = ssl_config.passphrase;
-                ctx_opts.ssl_prefer_low_memory_usage = @boolToInt(ssl_config.low_memory_mode);
+                ctx_opts.ssl_prefer_low_memory_usage = @intFromBool(ssl_config.low_memory_mode);
 
                 if (ssl_config.key) |key| {
                     ctx_opts.key = key.ptr;
@@ -215,6 +217,7 @@ pub const ServerConfig = struct {
                 "dh_params_file_name",
                 "passphrase",
                 "ssl_ciphers",
+                "protos",
             };
 
             inline for (fields) |field| {
@@ -270,6 +273,9 @@ pub const ServerConfig = struct {
 
         pub fn inJS(global: *JSC.JSGlobalObject, obj: JSC.JSValue, exception: JSC.C.ExceptionRef) ?SSLConfig {
             var result = zero;
+            var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
+            defer arena.deinit();
+
             if (!obj.isObject()) {
                 JSC.throwInvalidArguments("tls option expects an object", .{}, global, exception);
                 return null;
@@ -301,7 +307,6 @@ pub const ServerConfig = struct {
 
                         var i: u32 = 0;
                         var valid_count: u32 = 0;
-                        var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
                         while (i < count) : (i += 1) {
                             const item = js_obj.getIndex(global, i);
                             if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), item, exception)) |sb| {
@@ -317,7 +322,6 @@ pub const ServerConfig = struct {
                                     valid_count += 1;
                                     any = true;
                                 } else {
-                                    arena.deinit();
                                     // mark and free all CA's
                                     result.cert = native_array;
                                     result.deinit();
@@ -325,7 +329,6 @@ pub const ServerConfig = struct {
                                 }
                             } else {
                                 global.throwInvalidArguments("key argument must be an string, Buffer, TypedArray, BunFile or an array containing string, Buffer, TypedArray or BunFile", .{});
-                                arena.deinit();
                                 // mark and free all keys
                                 result.key = native_array;
                                 result.deinit();
@@ -333,8 +336,6 @@ pub const ServerConfig = struct {
                             }
                         }
 
-                        arena.deinit();
-
                         if (valid_count == 0) {
                             bun.default_allocator.free(native_array);
                         } else {
@@ -356,7 +357,6 @@ pub const ServerConfig = struct {
                     }
                 } else {
                     const native_array = bun.default_allocator.alloc([*c]const u8, 1) catch unreachable;
-                    var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
                     if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), js_obj, exception)) |sb| {
                         const sliced = sb.slice();
                         if (sliced.len > 0) {
@@ -369,14 +369,11 @@ pub const ServerConfig = struct {
                         }
                     } else {
                         global.throwInvalidArguments("key argument must be an string, Buffer, TypedArray, BunFile or an array containing string, Buffer, TypedArray or BunFile", .{});
-                        arena.deinit();
                         // mark and free all certs
                         result.key = native_array;
                         result.deinit();
                         return null;
                     }
-
-                    arena.deinit();
                 }
             }
 
@@ -394,6 +391,22 @@ pub const ServerConfig = struct {
                 }
             }
 
+            if (obj.getTruthy(global, "ALPNProtocols")) |protocols| {
+                if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), protocols, exception)) |sb| {
+                    const sliced = sb.slice();
+                    if (sliced.len > 0) {
+                        result.protos = bun.default_allocator.dupeZ(u8, sliced) catch unreachable;
+                        result.protos_len = sliced.len;
+                    }
+
+                    any = true;
+                } else {
+                    global.throwInvalidArguments("ALPNProtocols argument must be an string, Buffer or TypedArray", .{});
+                    result.deinit();
+                    return null;
+                }
+            }
+
             if (obj.getTruthy(global, "cert")) |js_obj| {
                 if (js_obj.jsType().isArray()) {
                     const count = js_obj.getLength(global);
@@ -403,7 +416,6 @@ pub const ServerConfig = struct {
                         var i: u32 = 0;
                         var valid_count: u32 = 0;
 
-                        var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
                         while (i < count) : (i += 1) {
                             const item = js_obj.getIndex(global, i);
                             if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), item, exception)) |sb| {
@@ -419,7 +431,6 @@ pub const ServerConfig = struct {
                                     valid_count += 1;
                                     any = true;
                                 } else {
-                                    arena.deinit();
                                     // mark and free all CA's
                                     result.cert = native_array;
                                     result.deinit();
@@ -427,7 +438,6 @@ pub const ServerConfig = struct {
                                 }
                             } else {
                                 global.throwInvalidArguments("cert argument must be an string, Buffer, TypedArray, BunFile or an array containing string, Buffer, TypedArray or BunFile", .{});
-                                arena.deinit();
                                 // mark and free all certs
                                 result.cert = native_array;
                                 result.deinit();
@@ -435,8 +445,6 @@ pub const ServerConfig = struct {
                             }
                         }
 
-                        arena.deinit();
-
                         if (valid_count == 0) {
                             bun.default_allocator.free(native_array);
                         } else {
@@ -458,7 +466,6 @@ pub const ServerConfig = struct {
                     }
                 } else {
                     const native_array = bun.default_allocator.alloc([*c]const u8, 1) catch unreachable;
-                    var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
                     if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), js_obj, exception)) |sb| {
                         const sliced = sb.slice();
                         if (sliced.len > 0) {
@@ -471,14 +478,11 @@ pub const ServerConfig = struct {
                         }
                     } else {
                         global.throwInvalidArguments("cert argument must be an string, Buffer, TypedArray, BunFile or an array containing string, Buffer, TypedArray or BunFile", .{});
-                        arena.deinit();
                         // mark and free all certs
                         result.cert = native_array;
                         result.deinit();
                         return null;
                     }
-
-                    arena.deinit();
                 }
             }
 
@@ -518,7 +522,6 @@ pub const ServerConfig = struct {
                         var i: u32 = 0;
                         var valid_count: u32 = 0;
 
-                        var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
                         while (i < count) : (i += 1) {
                             const item = js_obj.getIndex(global, i);
                             if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), item, exception)) |sb| {
@@ -534,7 +537,6 @@ pub const ServerConfig = struct {
                                     valid_count += 1;
                                     any = true;
                                 } else {
-                                    arena.deinit();
                                     // mark and free all CA's
                                     result.cert = native_array;
                                     result.deinit();
@@ -542,7 +544,6 @@ pub const ServerConfig = struct {
                                 }
                             } else {
                                 global.throwInvalidArguments("ca argument must be an string, Buffer, TypedArray, BunFile or an array containing string, Buffer, TypedArray or BunFile", .{});
-                                arena.deinit();
                                 // mark and free all CA's
                                 result.cert = native_array;
                                 result.deinit();
@@ -550,8 +551,6 @@ pub const ServerConfig = struct {
                             }
                         }
 
-                        arena.deinit();
-
                         if (valid_count == 0) {
                             bun.default_allocator.free(native_array);
                         } else {
@@ -573,7 +572,6 @@ pub const ServerConfig = struct {
                     }
                 } else {
                     const native_array = bun.default_allocator.alloc([*c]const u8, 1) catch unreachable;
-                    var arena: @import("root").bun.ArenaAllocator = @import("root").bun.ArenaAllocator.init(bun.default_allocator);
                     if (JSC.Node.StringOrBuffer.fromJS(global, arena.allocator(), js_obj, exception)) |sb| {
                         const sliced = sb.slice();
                         if (sliced.len > 0) {
@@ -586,13 +584,11 @@ pub const ServerConfig = struct {
                         }
                     } else {
                         JSC.throwInvalidArguments("ca argument must be an string, Buffer, TypedArray, BunFile or an array containing string, Buffer, TypedArray or BunFile", .{}, global, exception);
-                        arena.deinit();
                         // mark and free all certs
                         result.ca = native_array;
                         result.deinit();
                         return null;
                     }
-                    arena.deinit();
                 }
             }
 
@@ -1000,6 +996,30 @@ const HTTPStatusText = struct {
     }
 };
 
+fn NewFlags(comptime debug_mode: bool) type {
+    return packed struct {
+        has_marked_complete: bool = false,
+        has_marked_pending: bool = false,
+        has_abort_handler: bool = false,
+        has_sendfile_ctx: bool = false,
+        has_called_error_handler: bool = false,
+        needs_content_length: bool = false,
+        needs_content_range: bool = false,
+        /// Used to avoid looking at the uws.Request struct after it's been freed
+        is_transfer_encoding: bool = false,
+
+        /// Used to identify if request can be safely deinitialized
+        is_waiting_body: bool = false,
+        /// Used in renderMissing in debug mode to show the user an HTML page
+        /// Used to avoid looking at the uws.Request struct after it's been freed
+        is_web_browser_navigation: if (debug_mode) bool else void = if (debug_mode) false else {},
+        has_written_status: bool = false,
+        response_protected: bool = false,
+        aborted: bool = false,
+        finalized: bun.DebugOnly(bool) = bun.DebugOnlyDefault(false),
+    };
+}
+
 // This is defined separately partially to work-around an LLVM debugger bug.
 fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comptime ThisServer: type) type {
     return struct {
@@ -1024,63 +1044,42 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         req: *uws.Request,
         signal: ?*JSC.WebCore.AbortSignal = null,
         method: HTTP.Method,
-        aborted: bool = false,
-        finalized: bun.DebugOnly(bool) = bun.DebugOnlyDefault(false),
+
+        flags: NewFlags(debug_mode) = .{},
+
         upgrade_context: ?*uws.uws_socket_context_t = null,
 
         /// We can only safely free once the request body promise is finalized
         /// and the response is rejected
+        response_jsvalue: JSC.JSValue = JSC.JSValue.zero,
         pending_promises_for_abort: u8 = 0,
 
-        has_marked_complete: bool = false,
-        has_marked_pending: bool = false,
-
-        response_jsvalue: JSC.JSValue = JSC.JSValue.zero,
-        response_protected: bool = false,
         response_ptr: ?*JSC.WebCore.Response = null,
         blob: JSC.WebCore.AnyBlob = JSC.WebCore.AnyBlob{ .Blob = .{} },
         promise: ?*JSC.JSValue = null,
-        has_abort_handler: bool = false,
-        has_sendfile_ctx: bool = false,
-        has_called_error_handler: bool = false,
-        needs_content_length: bool = false,
-        needs_content_range: bool = false,
+
         sendfile: SendfileContext = undefined,
         request_body: ?*JSC.WebCore.BodyValueRef = null,
         request_body_buf: std.ArrayListUnmanaged(u8) = .{},
         request_body_content_len: usize = 0,
 
-        /// Used to avoid looking at the uws.Request struct after it's been freed
-        is_transfer_encoding: bool = false,
-
-        /// Used to identify if request can be safely deinitialized
-        is_waiting_body: bool = false,
-
-        /// Used in renderMissing in debug mode to show the user an HTML page
-        /// Used to avoid looking at the uws.Request struct after it's been freed
-        is_web_browser_navigation: if (debug_mode) bool else void = if (debug_mode) false else {},
-
         sink: ?*ResponseStream.JSSink = null,
         byte_stream: ?*JSC.WebCore.ByteStream = null,
 
         /// Used in errors
         pathname: []const u8 = "",
 
-        has_written_status: bool = false,
-
         /// Used either for temporary blob data or fallback
         /// When the response body is a temporary value
         response_buf_owned: std.ArrayListUnmanaged(u8) = .{},
 
-        keepalive: bool = true,
-
         // TODO: support builtin compression
         const can_sendfile = !ssl_enabled;
 
         pub fn setAbortHandler(this: *RequestContext) void {
-            if (this.has_abort_handler) return;
+            if (this.flags.has_abort_handler) return;
             if (this.resp) |resp| {
-                this.has_abort_handler = true;
+                this.flags.has_abort_handler = true;
                 resp.onAborted(*RequestContext, RequestContext.onAbort, this);
             }
         }
@@ -1094,7 +1093,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             result.ensureStillAlive();
 
             ctx.pending_promises_for_abort -|= 1;
-            if (ctx.aborted) {
+            if (ctx.flags.aborted) {
                 ctx.finalizeForAbort();
                 return JSValue.jsUndefined();
             }
@@ -1121,8 +1120,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 return;
             };
             ctx.response_jsvalue = value;
-            std.debug.assert(!ctx.response_protected);
-            ctx.response_protected = true;
+            std.debug.assert(!ctx.flags.response_protected);
+            ctx.flags.response_protected = true;
             JSC.C.JSValueProtect(ctx.server.globalThis, value.asObjectRef());
 
             ctx.render(response);
@@ -1143,7 +1142,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
             ctx.pending_promises_for_abort -|= 1;
 
-            if (ctx.aborted) {
+            if (ctx.flags.aborted) {
                 ctx.finalizeForAbort();
                 return JSValue.jsUndefined();
             }
@@ -1163,7 +1162,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                     value,
                 );
 
-            if (ctx.aborted) {
+            if (ctx.flags.aborted) {
                 ctx.finalizeForAbort();
                 return;
             }
@@ -1174,7 +1173,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 return;
             }
 
-            if (!resp.hasResponded() and !ctx.has_marked_pending) {
+            if (!resp.hasResponded() and !ctx.flags.has_marked_pending) {
                 ctx.renderMissing();
                 return;
             }
@@ -1190,14 +1189,14 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         pub fn renderMissingCorked(ctx: *RequestContext) void {
             if (ctx.resp) |resp| {
                 if (comptime !debug_mode) {
-                    if (!ctx.has_written_status)
+                    if (!ctx.flags.has_written_status)
                         resp.writeStatus("204 No Content");
-                    ctx.has_written_status = true;
+                    ctx.flags.has_written_status = true;
                     ctx.end("", ctx.shouldCloseConnection());
                 } else {
-                    if (ctx.is_web_browser_navigation) {
+                    if (ctx.flags.is_web_browser_navigation) {
                         resp.writeStatus("200 OK");
-                        ctx.has_written_status = true;
+                        ctx.flags.has_written_status = true;
 
                         resp.writeHeader("content-type", MimeType.html.value);
                         resp.writeHeader("content-encoding", "gzip");
@@ -1206,9 +1205,9 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                         return;
                     }
 
-                    if (!ctx.has_written_status)
+                    if (!ctx.flags.has_written_status)
                         resp.writeStatus("200 OK");
-                    ctx.has_written_status = true;
+                    ctx.flags.has_written_status = true;
                     ctx.end("Welcome to Bun! To get started, return a Response object.", ctx.shouldCloseConnection());
                 }
             }
@@ -1222,8 +1221,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             comptime fmt: string,
             args: anytype,
         ) void {
-            if (!this.has_written_status) {
-                this.has_written_status = true;
+            if (!this.flags.has_written_status) {
+                this.flags.has_written_status = true;
                 if (this.resp) |resp| {
                     resp.writeStatus("500 Internal Server Error");
                     resp.writeHeader("content-type", MimeType.html.value);
@@ -1240,7 +1239,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 .reason = .fetch_event_handler,
                 .cwd = VirtualMachine.get().bundler.fs.top_level_dir,
                 .problems = Api.Problems{
-                    .code = @truncate(u16, @errorToInt(err)),
+                    .code = @truncate(u16, @intFromError(err)),
                     .name = @errorName(err),
                     .exceptions = exceptions,
                     .build = log.toAPI(allocator) catch unreachable,
@@ -1265,7 +1264,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 return;
             }
 
-            this.has_marked_pending = true;
+            this.flags.has_marked_pending = true;
             this.response_buf_owned = std.ArrayListUnmanaged(u8){ .items = bb.items, .capacity = bb.capacity };
 
             if (this.resp) |resp| {
@@ -1290,7 +1289,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                     this.response_buf_owned.items.len,
                     this.shouldCloseConnection(),
                 )) {
-                    this.has_marked_pending = true;
+                    this.flags.has_marked_pending = true;
                     resp.onWritable(*RequestContext, onWritableCompleteResponseBuffer, this);
                     this.setAbortHandler();
                     return;
@@ -1314,8 +1313,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn end(this: *RequestContext, data: []const u8, closeConnection: bool) void {
             if (this.resp) |resp| {
-                if (this.is_waiting_body) {
-                    this.is_waiting_body = false;
+                if (this.flags.is_waiting_body) {
+                    this.flags.is_waiting_body = false;
                     resp.clearOnData();
                 }
                 resp.end(data, closeConnection);
@@ -1325,8 +1324,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn endStream(this: *RequestContext, closeConnection: bool) void {
             if (this.resp) |resp| {
-                if (this.is_waiting_body) {
-                    this.is_waiting_body = false;
+                if (this.flags.is_waiting_body) {
+                    this.flags.is_waiting_body = false;
                     resp.clearOnData();
                 }
                 resp.endStream(closeConnection);
@@ -1336,8 +1335,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn endWithoutBody(this: *RequestContext, closeConnection: bool) void {
             if (this.resp) |resp| {
-                if (this.is_waiting_body) {
-                    this.is_waiting_body = false;
+                if (this.flags.is_waiting_body) {
+                    this.flags.is_waiting_body = false;
                     resp.clearOnData();
                 }
                 resp.endWithoutBody(closeConnection);
@@ -1347,7 +1346,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn onWritableResponseBuffer(this: *RequestContext, _: c_ulong, resp: *App.Response) callconv(.C) bool {
             std.debug.assert(this.resp == resp);
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return false;
             }
@@ -1360,12 +1359,12 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         pub fn onWritableCompleteResponseBufferAndMetadata(this: *RequestContext, write_offset: c_ulong, resp: *App.Response) callconv(.C) bool {
             std.debug.assert(this.resp == resp);
 
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return false;
             }
 
-            if (!this.has_written_status) {
+            if (!this.flags.has_written_status) {
                 this.renderMetadata();
             }
 
@@ -1380,7 +1379,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn onWritableCompleteResponseBuffer(this: *RequestContext, write_offset: c_ulong, resp: *App.Response) callconv(.C) bool {
             std.debug.assert(this.resp == resp);
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return false;
             }
@@ -1417,9 +1416,9 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn onAbort(this: *RequestContext, resp: *App.Response) void {
             std.debug.assert(this.resp == resp);
-            std.debug.assert(!this.aborted);
+            std.debug.assert(!this.flags.aborted);
             //mark request as aborted
-            this.aborted = true;
+            this.flags.aborted = true;
 
             // if signal is not aborted, abort the signal
             if (this.signal) |signal| {
@@ -1456,12 +1455,11 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                         // the promise is pending
                         if (body.value.Locked.action != .none or body.value.Locked.promise != null) {
                             this.pending_promises_for_abort += 1;
-                            body.value.toErrorInstance(JSC.toTypeError(.ABORT_ERR, "Request aborted", .{}, this.server.globalThis), this.server.globalThis);
                         } else if (body.value.Locked.readable != null) {
                             body.value.Locked.readable.?.abort(this.server.globalThis);
-                            body.value.toErrorInstance(JSC.toTypeError(.ABORT_ERR, "Request aborted", .{}, this.server.globalThis), this.server.globalThis);
                             body.value.Locked.readable = null;
                         }
+                        body.value.toErrorInstance(JSC.toTypeError(.ABORT_ERR, "Request aborted", .{}, this.server.globalThis), this.server.globalThis);
                     }
                 }
 
@@ -1488,8 +1486,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         }
 
         pub fn markComplete(this: *RequestContext) void {
-            if (!this.has_marked_complete) this.server.onRequestComplete();
-            this.has_marked_complete = true;
+            if (!this.flags.has_marked_complete) this.server.onRequestComplete();
+            this.flags.has_marked_complete = true;
         }
 
         // This function may be called multiple times
@@ -1499,15 +1497,15 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             this.blob.detach();
 
             if (comptime Environment.allow_assert) {
-                std.debug.assert(!this.finalized);
-                this.finalized = true;
+                std.debug.assert(!this.flags.finalized);
+                this.flags.finalized = true;
             }
 
             if (!this.response_jsvalue.isEmpty()) {
                 ctxLog("finalizeWithoutDeinit: response_jsvalue != .zero", .{});
-                if (this.response_protected) {
+                if (this.flags.response_protected) {
                     this.response_jsvalue.unprotect();
-                    this.response_protected = false;
+                    this.flags.response_protected = false;
                 }
                 this.response_jsvalue = JSC.JSValue.zero;
             }
@@ -1515,7 +1513,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             // if signal is not aborted, abort the signal
             if (this.signal) |signal| {
                 this.signal = null;
-                if (this.aborted and !signal.aborted()) {
+                if (this.flags.aborted and !signal.aborted()) {
                     const reason = JSC.WebCore.AbortSignal.createAbortError(JSC.ZigString.static("The user aborted a request"), &JSC.ZigString.Empty, this.server.globalThis);
                     reason.ensureStillAlive();
                     _ = signal.signal(reason);
@@ -1558,9 +1556,9 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
             // if we are waiting for the body yet and the request was not aborted we can safely clear the onData callback
             if (this.resp) |resp| {
-                if (this.is_waiting_body and this.aborted == false) {
+                if (this.flags.is_waiting_body and this.flags.aborted == false) {
                     resp.clearOnData();
-                    this.is_waiting_body = false;
+                    this.flags.is_waiting_body = false;
                 }
             }
         }
@@ -1574,10 +1572,10 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         pub fn deinit(this: *RequestContext) void {
             ctxLog("deinit<d> ({*})<r>", .{this});
             if (comptime Environment.allow_assert)
-                std.debug.assert(this.finalized);
+                std.debug.assert(this.flags.finalized);
 
             if (comptime Environment.allow_assert)
-                std.debug.assert(this.has_marked_complete);
+                std.debug.assert(this.flags.has_marked_complete);
 
             var server = this.server;
             this.request_body_buf.clearAndFree(this.allocator);
@@ -1605,8 +1603,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn writeStatus(this: *RequestContext, status: u16) void {
             var status_text_buf: [48]u8 = undefined;
-            std.debug.assert(!this.has_written_status);
-            this.has_written_status = true;
+            std.debug.assert(!this.flags.has_written_status);
+            this.flags.has_written_status = true;
 
             if (this.resp) |resp| {
                 if (HTTPStatusText.get(status)) |text| {
@@ -1635,7 +1633,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         }};
 
         pub fn onSendfile(this: *RequestContext) bool {
-            if (this.aborted or this.resp == null) {
+            if (this.flags.aborted or this.resp == null) {
                 this.cleanupAndFinalizeAfterSendfile();
                 return false;
             }
@@ -1657,7 +1655,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
                 this.sendfile.remain -|= @intCast(Blob.SizeType, this.sendfile.offset -| start);
 
-                if (errcode != .SUCCESS or this.aborted or this.sendfile.remain == 0 or val == 0) {
+                if (errcode != .SUCCESS or this.flags.aborted or this.sendfile.remain == 0 or val == 0) {
                     if (errcode != .AGAIN and errcode != .SUCCESS and errcode != .PIPE) {
                         Output.prettyErrorln("Error: {s}", .{@tagName(errcode)});
                         Output.flush();
@@ -1680,7 +1678,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 const wrote = @intCast(Blob.SizeType, sbytes);
                 this.sendfile.offset +|= wrote;
                 this.sendfile.remain -|= wrote;
-                if (errcode != .AGAIN or this.aborted or this.sendfile.remain == 0 or sbytes == 0) {
+                if (errcode != .AGAIN or this.flags.aborted or this.sendfile.remain == 0 or sbytes == 0) {
                     if (errcode != .AGAIN and errcode != .SUCCESS and errcode != .PIPE) {
                         Output.prettyErrorln("Error: {s}", .{@tagName(errcode)});
                         Output.flush();
@@ -1692,7 +1690,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
             if (!this.sendfile.has_set_on_writable) {
                 this.sendfile.has_set_on_writable = true;
-                this.has_marked_pending = true;
+                this.flags.has_marked_pending = true;
                 resp.onWritable(*RequestContext, onWritableSendfile, this);
             }
 
@@ -1704,7 +1702,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn onWritableBytes(this: *RequestContext, write_offset: c_ulong, resp: *App.Response) callconv(.C) bool {
             std.debug.assert(this.resp == resp);
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return false;
             }
@@ -1725,7 +1723,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 this.finalize();
                 return true;
             } else {
-                this.has_marked_pending = true;
+                this.flags.has_marked_pending = true;
                 resp.onWritable(*RequestContext, onWritableBytes, this);
                 return true;
             }
@@ -1739,7 +1737,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 this.response_buf_owned.items.len = 0;
                 this.finalize();
             } else {
-                this.has_marked_pending = true;
+                this.flags.has_marked_pending = true;
                 resp.onWritable(*RequestContext, onWritableCompleteResponseBuffer, this);
             }
 
@@ -1790,11 +1788,11 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                     }
 
                     var err = JSC.Node.Syscall.Error{
-                        .errno = @intCast(JSC.Node.Syscall.Error.Int, @enumToInt(std.os.E.INVAL)),
+                        .errno = @intCast(JSC.Node.Syscall.Error.Int, @intFromEnum(std.os.E.INVAL)),
                         .syscall = .sendfile,
                     };
                     var sys = err.withPathLike(file.pathlike).toSystemError();
-                    sys.message = ZigString.init("MacOS does not support sending non-regular files");
+                    sys.message = bun.String.static("MacOS does not support sending non-regular files");
                     this.runErrorHandler(sys.toErrorInstance(
                         this.server.globalThis,
                     ));
@@ -1809,11 +1807,11 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                     }
 
                     var err = JSC.Node.Syscall.Error{
-                        .errno = @intCast(JSC.Node.Syscall.Error.Int, @enumToInt(std.os.E.INVAL)),
+                        .errno = @intCast(JSC.Node.Syscall.Error.Int, @intFromEnum(std.os.E.INVAL)),
                         .syscall = .sendfile,
                     };
                     var sys = err.withPathLike(file.pathlike).toSystemError();
-                    sys.message = ZigString.init("File must be regular or FIFO");
+                    sys.message = bun.String.static("File must be regular or FIFO");
                     this.runErrorHandler(sys.toErrorInstance(
                         this.server.globalThis,
                     ));
@@ -1828,21 +1826,21 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             else
                 @min(original_size, stat_size);
 
-            this.needs_content_length = true;
+            this.flags.needs_content_length = true;
 
             this.sendfile = .{
                 .fd = fd,
                 .remain = this.blob.Blob.offset + original_size,
                 .offset = this.blob.Blob.offset,
                 .auto_close = auto_close,
-                .socket_fd = if (!this.aborted) resp.getNativeHandle() else -999,
+                .socket_fd = if (!this.flags.aborted) resp.getNativeHandle() else -999,
             };
 
             // if we are sending only part of a file, include the content-range header
             // only include content-range automatically when using a file path instead of an fd
             // this is to better support manually controlling the behavior
             if (std.os.S.ISREG(stat.mode) and auto_close) {
-                this.needs_content_range = (this.sendfile.remain -| this.sendfile.offset) != stat_size;
+                this.flags.needs_content_range = (this.sendfile.remain -| this.sendfile.offset) != stat_size;
             }
 
             // we know the bounds when we are sending a regular file
@@ -1869,14 +1867,14 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         }
 
         pub fn doSendfile(this: *RequestContext, blob: Blob) void {
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return;
             }
 
-            if (this.has_sendfile_ctx) return;
+            if (this.flags.has_sendfile_ctx) return;
 
-            this.has_sendfile_ctx = true;
+            this.flags.has_sendfile_ctx = true;
 
             if (comptime can_sendfile) {
                 return this.renderSendFile(blob);
@@ -1887,7 +1885,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         }
 
         pub fn onReadFile(this: *RequestContext, result: Blob.Store.ReadFile.ResultType) void {
-            if (this.aborted or this.resp == null) {
+            if (this.flags.aborted or this.resp == null) {
                 this.finalizeForAbort();
                 return;
             }
@@ -1910,8 +1908,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 else
                     @min(original_size, stat_size);
 
-                if (!this.has_written_status)
-                    this.needs_content_range = true;
+                if (!this.flags.has_written_status)
+                    this.flags.needs_content_range = true;
 
                 // this is used by content-range
                 this.sendfile = .{
@@ -1932,14 +1930,14 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         }
 
         fn renderWithBlobFromBodyValue(this: *RequestContext) void {
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return;
             }
 
             if (this.blob.needsToReadFile()) {
                 this.req.setYield(false);
-                if (!this.has_sendfile_ctx)
+                if (!this.flags.has_sendfile_ctx)
                     this.doSendfile(this.blob.Blob);
                 return;
             }
@@ -1952,7 +1950,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         fn doRenderStream(pair: *StreamPair) void {
             var this = pair.this;
             var stream = pair.stream;
-            if (this.resp == null or this.aborted) {
+            if (this.resp == null or this.flags.aborted) {
                 stream.value.unprotect();
                 this.finalizeForAbort();
                 return;
@@ -2003,11 +2001,11 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 }
             }
 
-            this.aborted = this.aborted or response_stream.sink.aborted;
+            this.flags.aborted = this.flags.aborted or response_stream.sink.aborted;
 
             if (assignment_result.toError()) |err_value| {
                 streamLog("returned an error", .{});
-                if (!this.aborted) resp.clearAborted();
+                if (!this.flags.aborted) resp.clearAborted();
                 response_stream.detach();
                 this.sink = null;
                 response_stream.sink.destroy();
@@ -2019,7 +2017,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 // TODO: is there a condition where resp could be freed before done?
                 resp.hasResponded())
             {
-                if (!this.aborted) resp.clearAborted();
+                if (!this.flags.aborted) resp.clearAborted();
                 const wrote_anything = response_stream.sink.wrote > 0;
                 streamLog("is done", .{});
                 const responded = resp.hasResponded();
@@ -2027,10 +2025,10 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 response_stream.detach();
                 this.sink = null;
                 response_stream.sink.destroy();
-                if (!responded and !wrote_anything and !this.aborted) {
+                if (!responded and !wrote_anything and !this.flags.aborted) {
                     this.renderMissing();
                     return;
-                } else if (wrote_anything and !responded and !this.aborted) {
+                } else if (wrote_anything and !responded and !this.flags.aborted) {
                     this.endStream(this.shouldCloseConnection());
                 }
 
@@ -2078,7 +2076,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 }
             }
 
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 response_stream.detach();
                 stream.cancel(this.server.globalThis);
                 response_stream.sink.done = true;
@@ -2113,7 +2111,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         const streamLog = Output.scoped(.ReadableStream, false);
 
         pub fn didUpgradeWebSocket(this: *RequestContext) bool {
-            return @ptrToInt(this.upgrade_context) == std.math.maxInt(usize);
+            return @intFromPtr(this.upgrade_context) == std.math.maxInt(usize);
         }
 
         pub fn onResponse(
@@ -2127,7 +2125,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             request_value.ensureStillAlive();
             response_value.ensureStillAlive();
 
-            if (ctx.aborted) {
+            if (ctx.flags.aborted) {
                 ctx.finalizeForAbort();
                 return;
             }
@@ -2154,19 +2152,19 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             if (response_value.as(JSC.WebCore.Response)) |response| {
                 ctx.response_jsvalue = response_value;
                 ctx.response_jsvalue.ensureStillAlive();
-                ctx.response_protected = false;
+                ctx.flags.response_protected = false;
                 response.body.value.toBlobIfPossible();
 
                 switch (response.body.value) {
                     .Blob => |*blob| {
                         if (blob.needsToReadFile()) {
                             response_value.protect();
-                            ctx.response_protected = true;
+                            ctx.flags.response_protected = true;
                         }
                     },
                     .Locked => {
                         response_value.protect();
-                        ctx.response_protected = true;
+                        ctx.flags.response_protected = true;
                     },
                     else => {},
                 }
@@ -2204,19 +2202,19 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
                         ctx.response_jsvalue = fulfilled_value;
                         ctx.response_jsvalue.ensureStillAlive();
-                        ctx.response_protected = false;
+                        ctx.flags.response_protected = false;
                         ctx.response_ptr = response;
                         response.body.value.toBlobIfPossible();
                         switch (response.body.value) {
                             .Blob => |*blob| {
                                 if (blob.needsToReadFile()) {
                                     fulfilled_value.protect();
-                                    ctx.response_protected = true;
+                                    ctx.flags.response_protected = true;
                                 }
                             },
                             .Locked => {
                                 fulfilled_value.protect();
-                                ctx.response_protected = true;
+                                ctx.flags.response_protected = true;
                             },
                             else => {},
                         }
@@ -2259,7 +2257,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             }
             if (ctx.resp) |resp| {
                 // The user returned something that wasn't a promise or a promise with a response
-                if (!resp.hasResponded() and !ctx.has_marked_pending) ctx.renderMissing();
+                if (!resp.hasResponded() and !ctx.flags.has_marked_pending) ctx.renderMissing();
             }
         }
 
@@ -2270,7 +2268,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             if (req.sink) |wrapper| {
                 wrapper.sink.pending_flush = null;
                 wrapper.sink.done = true;
-                req.aborted = req.aborted or wrapper.sink.aborted;
+                req.flags.aborted = req.flags.aborted or wrapper.sink.aborted;
                 wrote_anything = wrapper.sink.wrote > 0;
                 wrapper.sink.finalize();
                 wrapper.detach();
@@ -2288,7 +2286,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             streamLog("onResolve({any})", .{wrote_anything});
 
             //aborted so call finalizeForAbort
-            if (req.aborted or req.resp == null) {
+            if (req.flags.aborted or req.resp == null) {
                 req.finalizeForAbort();
                 return;
             }
@@ -2326,13 +2324,13 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn handleRejectStream(req: *@This(), globalThis: *JSC.JSGlobalObject, err: JSValue) void {
             streamLog("handleRejectStream", .{});
-            var wrote_anything = req.has_written_status;
+            var wrote_anything = req.flags.has_written_status;
 
             if (req.sink) |wrapper| {
                 wrapper.sink.pending_flush = null;
                 wrapper.sink.done = true;
                 wrote_anything = wrote_anything or wrapper.sink.wrote > 0;
-                req.aborted = req.aborted or wrapper.sink.aborted;
+                req.flags.aborted = req.flags.aborted or wrapper.sink.aborted;
                 wrapper.sink.finalize();
                 wrapper.detach();
                 req.sink = null;
@@ -2349,7 +2347,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             streamLog("onReject({any})", .{wrote_anything});
 
             //aborted so call finalizeForAbort
-            if (req.aborted) {
+            if (req.flags.aborted) {
                 req.finalizeForAbort();
                 return;
             }
@@ -2373,8 +2371,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             }
 
             const fallback = JSC.SystemError{
-                .code = ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_UNHANDLED_ERROR))),
-                .message = ZigString.init("Unhandled error in ReadableStream"),
+                .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_UNHANDLED_ERROR))),
+                .message = bun.String.static("Unhandled error in ReadableStream"),
             };
             req.handleReject(fallback.toErrorInstance(globalThis));
         }
@@ -2388,7 +2386,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 .Error => {
                     const err = value.Error;
                     _ = value.use();
-                    if (this.aborted) {
+                    if (this.flags.aborted) {
                         this.finalizeForAbort();
                         return;
                     }
@@ -2406,7 +2404,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                     return;
                 },
                 .Locked => |*lock| {
-                    if (this.aborted) {
+                    if (this.flags.aborted) {
                         this.finalizeForAbort();
                         return;
                     }
@@ -2420,8 +2418,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                         if (stream.isLocked(this.server.globalThis)) {
                             streamLog("was locked but it shouldn't be", .{});
                             var err = JSC.SystemError{
-                                .code = ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_STREAM_CANNOT_PIPE))),
-                                .message = ZigString.init("Stream already used, please create a new one"),
+                                .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_STREAM_CANNOT_PIPE))),
+                                .message = bun.String.static("Stream already used, please create a new one"),
                             };
                             stream.value.unprotect();
                             this.runErrorHandler(err.toErrorInstance(this.server.globalThis));
@@ -2509,7 +2507,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                 }
             }
 
-            if (this.aborted or this.resp == null) {
+            if (this.flags.aborted or this.resp == null) {
                 this.finalizeForAbort();
                 return;
             }
@@ -2528,7 +2526,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             } else {
                 // when it's the last one, we just want to know if it's done
                 if (stream.isDone()) {
-                    this.has_marked_pending = true;
+                    this.flags.has_marked_pending = true;
                     resp.onWritable(*RequestContext, onWritableResponseBuffer, this);
                 }
             }
@@ -2540,7 +2538,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             // Faster to do the memcpy than to do the two network calls
             // We are not streaming
             // This is an important performance optimization
-            if (this.has_abort_handler and this.blob.fastSize() < 16384 - 1024) {
+            if (this.flags.has_abort_handler and this.blob.fastSize() < 16384 - 1024) {
                 if (this.resp) |resp| {
                     resp.runCorkedWithType(*RequestContext, doRenderBlobCorked, this);
                 }
@@ -2557,7 +2555,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
         pub fn doRender(this: *RequestContext) void {
             ctxLog("render", .{});
 
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 this.finalizeForAbort();
                 return;
             }
@@ -2569,17 +2567,17 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             if (this.resp) |resp| {
                 switch (status) {
                     404 => {
-                        if (!this.has_written_status) {
+                        if (!this.flags.has_written_status) {
                             resp.writeStatus("404 Not Found");
-                            this.has_written_status = true;
+                            this.flags.has_written_status = true;
                         }
                         this.endWithoutBody(this.shouldCloseConnection());
                     },
                     else => {
-                        if (!this.has_written_status) {
+                        if (!this.flags.has_written_status) {
                             resp.writeStatus("500 Internal Server Error");
                             resp.writeHeader("content-type", "text/plain");
-                            this.has_written_status = true;
+                            this.flags.has_written_status = true;
                         }
 
                         this.end("Something went wrong!", this.shouldCloseConnection());
@@ -2600,7 +2598,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             if (this.pathname.len > 0)
                 return this.pathname;
 
-            if (!this.has_abort_handler) {
+            if (!this.flags.has_abort_handler) {
                 return this.req.url();
             }
 
@@ -2643,8 +2641,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             status: u16,
         ) void {
             JSC.markBinding(@src());
-            if (!this.server.config.onError.isEmpty() and !this.has_called_error_handler) {
-                this.has_called_error_handler = true;
+            if (!this.server.config.onError.isEmpty() and !this.flags.has_called_error_handler) {
+                this.flags.has_called_error_handler = true;
                 var args = [_]JSC.C.JSValueRef{value.asObjectRef()};
                 const result = JSC.C.JSObjectCallAsFunctionReturnValue(this.server.globalThis, this.server.config.onError.asObjectRef(), this.server.thisObject.asObjectRef(), 1, &args);
                 defer result.ensureStillAlive();
@@ -2679,7 +2677,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
             var response: *JSC.WebCore.Response = this.response_ptr.?;
             var status = response.statusCode();
-            var needs_content_range = this.needs_content_range and this.sendfile.remain < this.blob.size();
+            var needs_content_range = this.flags.needs_content_range and this.sendfile.remain < this.blob.size();
 
             const size = if (needs_content_range)
                 this.sendfile.remain
@@ -2744,26 +2742,22 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             // 1. Bun.file("foo")
             // 2. The content-disposition header is not present
             if (!has_content_disposition and content_type.category.autosetFilename()) {
-                if (this.blob.store()) |store| {
-                    if (store.data == .file) {
-                        if (store.data.file.pathlike == .path) {
-                            const basename = std.fs.path.basename(store.data.file.pathlike.path.slice());
-                            if (basename.len > 0) {
-                                var filename_buf: [1024]u8 = undefined;
-
-                                resp.writeHeader(
-                                    "content-disposition",
-                                    std.fmt.bufPrint(&filename_buf, "filename=\"{s}\"", .{basename[0..@min(basename.len, 1024 - 32)]}) catch "",
-                                );
-                            }
-                        }
+                if (this.blob.getFileName()) |filename| {
+                    const basename = std.fs.path.basename(filename);
+                    if (basename.len > 0) {
+                        var filename_buf: [1024]u8 = undefined;
+
+                        resp.writeHeader(
+                            "content-disposition",
+                            std.fmt.bufPrint(&filename_buf, "filename=\"{s}\"", .{basename[0..@min(basename.len, 1024 - 32)]}) catch "",
+                        );
                     }
                 }
             }
 
-            if (this.needs_content_length) {
+            if (this.flags.needs_content_length) {
                 resp.writeHeaderInt("content-length", size);
-                this.needs_content_length = false;
+                this.flags.needs_content_length = false;
             }
 
             if (needs_content_range) {
@@ -2780,7 +2774,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                         .{ this.sendfile.offset, this.sendfile.offset + (this.sendfile.remain -| 1) },
                     ) catch "bytes */*",
                 );
-                this.needs_content_range = false;
+                this.flags.needs_content_range = false;
             }
         }
 
@@ -2794,7 +2788,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                     bytes.len,
                     this.shouldCloseConnection(),
                 )) {
-                    this.has_marked_pending = true;
+                    this.flags.has_marked_pending = true;
                     resp.onWritable(*RequestContext, onWritableBytes, this);
                     // given a blob, we might not have set an abort handler yet
                     this.setAbortHandler();
@@ -2817,8 +2811,8 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
             std.debug.assert(this.resp == resp);
 
-            this.is_waiting_body = last == false;
-            if (this.aborted or this.has_marked_complete) return;
+            this.flags.is_waiting_body = last == false;
+            if (this.flags.aborted or this.flags.has_marked_complete) return;
 
             if (this.request_body != null) {
                 var body = this.request_body.?;
@@ -2874,7 +2868,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
                         const prev_len = bytes.items.len;
                         bytes.items.len = total;
                         var slice = bytes.items[prev_len..];
-                        @memcpy(slice.ptr, chunk.ptr, chunk.len);
+                        @memcpy(slice[0..chunk.len], chunk);
                         body.value = .{
                             .InternalBlob = .{
                                 .bytes = bytes.toManaged(this.allocator),
@@ -2898,7 +2892,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
 
         pub fn onStartStreamingRequestBody(this: *RequestContext) JSC.WebCore.DrainResult {
             ctxLog("onStartStreamingRequestBody", .{});
-            if (this.aborted) {
+            if (this.flags.aborted) {
                 return JSC.WebCore.DrainResult{
                     .aborted = {},
                 };
@@ -2928,7 +2922,7 @@ fn NewRequestContext(comptime ssl_enabled: bool, comptime debug_mode: bool, comp
             ctxLog("onStartBuffering", .{});
             // TODO: check if is someone calling onStartBuffering other than onStartBufferingCallback
             // if is not, this should be removed and only keep protect + setAbortHandler
-            if (this.is_transfer_encoding == false and this.request_body_content_len == 0) {
+            if (this.flags.is_transfer_encoding == false and this.request_body_content_len == 0) {
                 // no content-length or 0 content-length
                 // no transfer-encoding
                 if (this.request_body != null) {
@@ -3200,7 +3194,7 @@ pub const WebSocketServer = struct {
                     globalObject.throwInvalidArguments("websocket expects maxPayloadLength to be an integer", .{});
                     return null;
                 }
-                server.maxPayloadLength = @intCast(u32, @truncate(i33, @max(value.toInt64(), 0)));
+                server.maxPayloadLength = @intCast(u32, @max(value.toInt64(), 0));
             }
         }
         if (object.get(globalObject, "idleTimeout")) |value| {
@@ -3220,7 +3214,7 @@ pub const WebSocketServer = struct {
                     return null;
                 }
 
-                server.backpressureLimit = @intCast(u32, @truncate(i33, @max(value.toInt64(), 0)));
+                server.backpressureLimit = @intCast(u32, @min(@max(value.toInt64(), 0), std.math.maxInt(u32))); // clamp to [0, maxInt(u32)] so the cast cannot overflow
             }
         }
         // if (object.get(globalObject, "sendPings")) |value| {
@@ -3366,7 +3360,7 @@ pub const ServerWebSocket = struct {
         opcode: uws.Opcode,
     ) void {
         log("onMessage({d}): {s}", .{
-            @enumToInt(opcode),
+            @intFromEnum(opcode),
             message,
         });
         const onMessageHandler = this.handler.onMessage;
@@ -3566,11 +3560,6 @@ pub const ServerWebSocket = struct {
         if (message_value.asArrayBuffer(globalThis)) |array_buffer| {
             const buffer = array_buffer.slice();
 
-            if (buffer.len == 0) {
-                globalThis.throw("publish requires a non-empty message", .{});
-                return .zero;
-            }
-
             const result = if (!publish_to_self)
                 this.websocket.publish(topic_slice.slice(), buffer, .binary, compress)
             else
@@ -3586,9 +3575,6 @@ pub const ServerWebSocket = struct {
         {
             var string_slice = message_value.toSlice(globalThis, bun.default_allocator);
             defer string_slice.deinit();
-            if (string_slice.len == 0) {
-                return JSValue.jsNumber(0);
-            }
 
             const buffer = string_slice.slice();
 
@@ -3640,10 +3626,6 @@ pub const ServerWebSocket = struct {
 
         var topic_slice = topic_value.toSlice(globalThis, bun.default_allocator);
         defer topic_slice.deinit();
-        if (topic_slice.len == 0) {
-            globalThis.throw("publishText requires a non-empty topic", .{});
-            return .zero;
-        }
 
         const compress = args.len > 1 and compress_value.toBoolean();
 
@@ -3654,9 +3636,6 @@ pub const ServerWebSocket = struct {
 
         var string_slice = message_value.toSlice(globalThis, bun.default_allocator);
         defer string_slice.deinit();
-        if (string_slice.len == 0) {
-            return JSValue.jsNumber(0);
-        }
 
         const buffer = string_slice.slice();
 
@@ -3721,10 +3700,6 @@ pub const ServerWebSocket = struct {
         };
         const buffer = array_buffer.slice();
 
-        if (buffer.len == 0) {
-            return JSC.JSValue.jsNumber(0);
-        }
-
         const result = if (!publish_to_self)
             this.websocket.publish(topic_slice.slice(), buffer, .binary, compress)
         else
@@ -3889,10 +3864,6 @@ pub const ServerWebSocket = struct {
         }
 
         if (message_value.asArrayBuffer(globalThis)) |buffer| {
-            if (buffer.len == 0) {
-                return JSValue.jsNumber(0);
-            }
-
             switch (this.websocket.send(buffer.slice(), .binary, compress, true)) {
                 .backpressure => {
                     log("send() backpressure ({d} bytes)", .{buffer.len});
@@ -3912,9 +3883,6 @@ pub const ServerWebSocket = struct {
         {
             var string_slice = message_value.toSlice(globalThis, bun.default_allocator);
             defer string_slice.deinit();
-            if (string_slice.len == 0) {
-                return JSValue.jsNumber(0);
-            }
 
             const buffer = string_slice.slice();
             switch (this.websocket.send(buffer, .text, compress, true)) {
@@ -3966,9 +3934,6 @@ pub const ServerWebSocket = struct {
 
         var string_slice = message_value.toSlice(globalThis, bun.default_allocator);
         defer string_slice.deinit();
-        if (string_slice.len == 0) {
-            return JSValue.jsNumber(0);
-        }
 
         const buffer = string_slice.slice();
         switch (this.websocket.send(buffer, .text, compress, true)) {
@@ -4000,9 +3965,6 @@ pub const ServerWebSocket = struct {
 
         var string_slice = message_str.toSlice(globalThis, bun.default_allocator);
         defer string_slice.deinit();
-        if (string_slice.len == 0) {
-            return JSValue.jsNumber(0);
-        }
 
         const buffer = string_slice.slice();
         switch (this.websocket.send(buffer, .text, compress, true)) {
@@ -4049,10 +4011,6 @@ pub const ServerWebSocket = struct {
             return .zero;
         };
 
-        if (buffer.len == 0) {
-            return JSValue.jsNumber(0);
-        }
-
         switch (this.websocket.send(buffer.slice(), .binary, compress, true)) {
             .backpressure => {
                 log("sendBinary() backpressure ({d} bytes)", .{buffer.len});
@@ -4082,10 +4040,6 @@ pub const ServerWebSocket = struct {
 
         const buffer = array_buffer.slice();
 
-        if (buffer.len == 0) {
-            return JSValue.jsNumber(0);
-        }
-
         switch (this.websocket.send(buffer, .binary, compress, true)) {
             .backpressure => {
                 log("sendBinary() backpressure ({d} bytes)", .{buffer.len});
@@ -4422,36 +4376,23 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
 
             const compress = (compress_value orelse JSValue.jsBoolean(true)).toBoolean();
 
-            if (message_value.isEmptyOrUndefinedOrNull()) {
-                JSC.JSError(this.vm.allocator, "publish requires a non-empty message", .{}, globalThis, exception);
-                return .zero;
-            }
-
             if (message_value.asArrayBuffer(globalThis)) |buffer| {
-                if (buffer.len == 0) {
-                    JSC.JSError(this.vm.allocator, "publish requires a non-empty message", .{}, globalThis, exception);
-                    return .zero;
-                }
-
                 return JSValue.jsNumber(
                     // if 0, return 0
                     // else return number of bytes sent
-                    @as(i32, @boolToInt(uws.AnyWebSocket.publishWithOptions(ssl_enabled, app, topic_slice.slice(), buffer.slice(), .binary, compress))) * @intCast(i32, @truncate(u31, buffer.len)),
+                    @as(i32, @intFromBool(uws.AnyWebSocket.publishWithOptions(ssl_enabled, app, topic_slice.slice(), buffer.slice(), .binary, compress))) * @intCast(i32, @truncate(u31, buffer.len)),
                 );
             }
 
             {
                 var string_slice = message_value.toSlice(globalThis, bun.default_allocator);
                 defer string_slice.deinit();
-                if (string_slice.len == 0) {
-                    return JSValue.jsNumber(0);
-                }
 
                 const buffer = string_slice.slice();
                 return JSValue.jsNumber(
                     // if 0, return 0
                     // else return number of bytes sent
-                    @as(i32, @boolToInt(uws.AnyWebSocket.publishWithOptions(ssl_enabled, app, topic_slice.slice(), buffer, .text, compress))) * @intCast(i32, @truncate(u31, buffer.len)),
+                    @as(i32, @intFromBool(uws.AnyWebSocket.publishWithOptions(ssl_enabled, app, topic_slice.slice(), buffer, .text, compress))) * @intCast(i32, @truncate(u31, buffer.len)),
                 );
             }
 
@@ -4484,11 +4425,11 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
             }
 
             var upgrader = bun.cast(*RequestContext, request.upgrader.?);
-            if (upgrader.aborted or upgrader.resp == null) {
+            if (upgrader.flags.aborted or upgrader.resp == null) {
                 return JSC.jsBoolean(false);
             }
 
-            if (upgrader.upgrade_context == null or @ptrToInt(upgrader.upgrade_context) == std.math.maxInt(usize)) {
+            if (upgrader.upgrade_context == null or @intFromPtr(upgrader.upgrade_context) == std.math.maxInt(usize)) {
                 return JSC.jsBoolean(false);
             }
             const resp = upgrader.resp.?;
@@ -4582,7 +4523,7 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
             // See https://github.com/oven-sh/bun/issues/1339
 
             // obviously invalid pointer marks it as used
-            upgrader.upgrade_context = @intToPtr(*uws.uws_socket_context_s, std.math.maxInt(usize));
+            upgrader.upgrade_context = @ptrFromInt(*uws.uws_socket_context_s, std.math.maxInt(usize));
             request.upgrader = null;
 
             resp.clearAborted();
@@ -4947,7 +4888,7 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
                         if (reason.len == 0) {
                             break;
                         }
-                        @memcpy(output_buf[written..].ptr, reason.ptr, reason.len);
+                        @memcpy(output_buf[written..][0..reason.len], reason);
                         written += reason.len;
                     }
 
@@ -4958,7 +4899,7 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
                         if (reason.len > 0) {
                             output_buf[written..][0.." via ".len].* = " via ".*;
                             written += " via ".len;
-                            @memcpy(output_buf[written..].ptr, reason.ptr, reason.len);
+                            @memcpy(output_buf[written..][0..reason.len], reason);
                             written += reason.len;
                         }
                     }
@@ -4970,7 +4911,7 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
                         if (reason.len > 0) {
                             output_buf[written..][0] = ' ';
                             written += 1;
-                            @memcpy(output_buf[written..].ptr, reason.ptr, reason.len);
+                            @memcpy(output_buf[written..][0..reason.len], reason);
                             written += reason.len;
                         }
                     }
@@ -5110,7 +5051,7 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
             };
 
             if (comptime debug_mode) {
-                ctx.is_web_browser_navigation = brk: {
+                ctx.flags.is_web_browser_navigation = brk: {
                     if (ctx.req.header("sec-fetch-dest")) |fetch_dest| {
                         if (strings.eqlComptime(fetch_dest, "document")) {
                             break :brk true;
@@ -5141,8 +5082,8 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
                 }
 
                 ctx.request_body_content_len = req_len;
-                ctx.is_transfer_encoding = req.header("transfer-encoding") != null;
-                if (req_len > 0 or ctx.is_transfer_encoding) {
+                ctx.flags.is_transfer_encoding = req.header("transfer-encoding") != null;
+                if (req_len > 0 or ctx.flags.is_transfer_encoding) {
                     // we defer pre-allocating the body until we receive the first chunk
                     // that way if the client is lying about how big the body is or the client aborts
                     // we don't waste memory
@@ -5154,7 +5095,7 @@ pub fn NewServer(comptime ssl_enabled_: bool, comptime debug_mode_: bool) type {
                             .onStartStreaming = RequestContext.onStartStreamingRequestBodyCallback,
                         },
                     };
-                    ctx.is_waiting_body = true;
+                    ctx.flags.is_waiting_body = true;
                     resp.onData(*RequestContext, RequestContext.onBufferedBodyChunk, ctx);
                 }
             }
diff --git a/src/bun.js/api/sockets.classes.ts b/src/bun.js/api/sockets.classes.ts
index da07741a3..5bd073b9f 100644
--- a/src/bun.js/api/sockets.classes.ts
+++ b/src/bun.js/api/sockets.classes.ts
@@ -15,10 +15,17 @@ function generate(ssl) {
       authorized: {
         getter: "getAuthorized",
       },
+      alpnProtocol: {
+        getter: "getALPNProtocol",
+      },
       write: {
         fn: "write",
         length: 3,
       },
+      upgradeTLS: {
+        fn: "upgradeTLS",
+        length: 1,
+      },
       end: {
         fn: "end",
         length: 3,
@@ -82,6 +89,11 @@ function generate(ssl) {
         fn: "reload",
         length: 1,
       },
+
+      setServername: {
+        fn: "setServername",
+        length: 1,
+      },
     },
     finalize: true,
     construct: true,
diff --git a/src/bun.js/base.zig b/src/bun.js/base.zig
index 038f7f38b..4a1249b5d 100644
--- a/src/bun.js/base.zig
+++ b/src/bun.js/base.zig
@@ -1291,23 +1291,27 @@ pub fn NewClassWithInstanceType(
 
         const static_properties: [property_names.len + 1]js.JSStaticValue = brk: {
             var props: [property_names.len + 1]js.JSStaticValue = undefined;
-            std.mem.set(
-                js.JSStaticValue,
+            @memset(
                 &props,
                 js.JSStaticValue{
-                    .name = @intToPtr([*c]const u8, 0),
+                    .name = @ptrFromInt([*c]const u8, 0),
                     .getProperty = null,
                     .setProperty = null,
                     .attributes = js.JSPropertyAttributes.kJSPropertyAttributeNone,
                 },
             );
+            if (property_name_literals.len > 0 and @TypeOf(property_name_literals[0]) == [:0]const u8) {
+                @compileError("@typeInfo() struct field names are null-terminated");
+            }
             for (property_name_literals, 0..) |lit, i| {
                 props[i] = brk2: {
                     var static_prop = JSC.C.JSStaticValue{
-                        .name = lit.ptr[0..lit.len :0],
+                        // TODO: update when @typeInfo struct field names are sentinel terminated
+                        // https://github.com/ziglang/zig/issues/16072
+                        .name = lit ++ .{0},
                         .getProperty = null,
                         .setProperty = null,
-                        .attributes = @intToEnum(js.JSPropertyAttributes, 0),
+                        .attributes = @enumFromInt(js.JSPropertyAttributes, 0),
                     };
                     static_prop.getProperty = StaticProperty(i).getter;
 
@@ -1396,14 +1400,14 @@ pub fn NewClassWithInstanceType(
                             const ctxfn = CtxField.rfn;
                             const Func: std.builtin.Type.Fn = @typeInfo(@TypeOf(if (@typeInfo(@TypeOf(ctxfn)) == .Pointer) ctxfn.* else ctxfn)).Fn;
 
-                            var attributes: c_uint = @enumToInt(js.JSPropertyAttributes.kJSPropertyAttributeNone);
+                            var attributes: c_uint = @intFromEnum(js.JSPropertyAttributes.kJSPropertyAttributeNone);
 
                             if (comptime is_read_only or hasReadOnly(@TypeOf(CtxField))) {
-                                attributes |= @enumToInt(js.JSPropertyAttributes.kJSPropertyAttributeReadOnly);
+                                attributes |= @intFromEnum(js.JSPropertyAttributes.kJSPropertyAttributeReadOnly);
                             }
 
                             if (comptime hasEnumerable(@TypeOf(CtxField)) and !CtxField.enumerable) {
-                                attributes |= @enumToInt(js.JSPropertyAttributes.kJSPropertyAttributeDontEnum);
+                                attributes |= @intFromEnum(js.JSPropertyAttributes.kJSPropertyAttributeDontEnum);
                             }
 
                             const PointerType = comptime brk: {
@@ -1419,7 +1423,7 @@ pub fn NewClassWithInstanceType(
                                     PointerType,
                                     if (@typeInfo(@TypeOf(ctxfn)) == .Pointer) ctxfn.* else ctxfn,
                                 ).rfn,
-                                .attributes = @intToEnum(js.JSPropertyAttributes, attributes),
+                                .attributes = @enumFromInt(js.JSPropertyAttributes, attributes),
                             };
 
                             count += 1;
@@ -1438,12 +1442,12 @@ pub fn NewClassWithInstanceType(
                             def.hasInstance = &staticFunctions.hasInstance;
                         } else {
                             const attributes: js.JSPropertyAttributes = brk: {
-                                var base = @enumToInt(js.JSPropertyAttributes.kJSPropertyAttributeNone);
+                                var base = @intFromEnum(js.JSPropertyAttributes.kJSPropertyAttributeNone);
 
                                 if (is_read_only)
-                                    base |= @enumToInt(js.JSPropertyAttributes.kJSPropertyAttributeReadOnly);
+                                    base |= @intFromEnum(js.JSPropertyAttributes.kJSPropertyAttributeReadOnly);
 
-                                break :brk @intToEnum(js.JSPropertyAttributes, base);
+                                break :brk @enumFromInt(js.JSPropertyAttributes, base);
                             };
 
                             __static_functions[count] = js.JSStaticFunction{
@@ -1834,7 +1838,7 @@ pub const ArrayBuffer = extern struct {
                 this.ptr,
                 this.byte_len,
                 MarkedArrayBuffer_deallocator,
-                @intToPtr(*anyopaque, @ptrToInt(&bun.default_allocator)),
+                @ptrFromInt(*anyopaque, @intFromPtr(&bun.default_allocator)),
                 exception,
             ));
         }
@@ -1845,7 +1849,7 @@ pub const ArrayBuffer = extern struct {
             this.ptr,
             this.byte_len,
             MarkedArrayBuffer_deallocator,
-            @intToPtr(*anyopaque, @ptrToInt(&bun.default_allocator)),
+            @ptrFromInt(*anyopaque, @intFromPtr(&bun.default_allocator)),
             exception,
         ));
     }
@@ -2068,7 +2072,7 @@ pub const RefString = struct {
     pub const Callback = fn (ctx: *anyopaque, str: *RefString) void;
 
     pub fn computeHash(input: []const u8) u32 {
-        return std.hash.XxHash32.hash(input);
+        return std.hash.XxHash32.hash(0, input);
     }
 
     pub fn slice(this: *RefString) []const u8 {
@@ -3355,7 +3359,7 @@ pub const FilePoll = struct {
     const DNSResolver = JSC.DNS.DNSResolver;
     const GetAddrInfoRequest = JSC.DNS.GetAddrInfoRequest;
     const Deactivated = opaque {
-        pub var owner: Owner = Owner.init(@intToPtr(*Deactivated, @as(usize, 0xDEADBEEF)));
+        pub var owner: Owner = Owner.init(@ptrFromInt(*Deactivated, @as(usize, 0xDEADBEEF)));
     };
 
     pub const Owner = bun.TaggedPointerUnion(.{
@@ -3610,7 +3614,7 @@ pub const FilePoll = struct {
             return;
         this.flags.insert(.disable);
 
-        vm.uws_event_loop.?.active -= @as(u32, @boolToInt(this.flags.contains(.has_incremented_poll_count)));
+        vm.uws_event_loop.?.active -= @as(u32, @intFromBool(this.flags.contains(.has_incremented_poll_count)));
     }
 
     pub fn enableKeepingProcessAlive(this: *FilePoll, vm: *JSC.VirtualMachine) void {
@@ -3618,7 +3622,7 @@ pub const FilePoll = struct {
             return;
         this.flags.remove(.disable);
 
-        vm.uws_event_loop.?.active += @as(u32, @boolToInt(this.flags.contains(.has_incremented_poll_count)));
+        vm.uws_event_loop.?.active += @as(u32, @intFromBool(this.flags.contains(.has_incremented_poll_count)));
     }
 
     pub fn canActivate(this: *const FilePoll) bool {
@@ -3628,16 +3632,16 @@ pub const FilePoll = struct {
     /// Only intended to be used from EventLoop.Pollable
     pub fn deactivate(this: *FilePoll, loop: *uws.Loop) void {
         std.debug.assert(this.flags.contains(.has_incremented_poll_count));
-        loop.num_polls -= @as(i32, @boolToInt(this.flags.contains(.has_incremented_poll_count)));
-        loop.active -|= @as(u32, @boolToInt(!this.flags.contains(.disable) and this.flags.contains(.has_incremented_poll_count)));
+        loop.num_polls -= @as(i32, @intFromBool(this.flags.contains(.has_incremented_poll_count)));
+        loop.active -|= @as(u32, @intFromBool(!this.flags.contains(.disable) and this.flags.contains(.has_incremented_poll_count)));
 
         this.flags.remove(.has_incremented_poll_count);
     }
 
     /// Only intended to be used from EventLoop.Pollable
     pub fn activate(this: *FilePoll, loop: *uws.Loop) void {
-        loop.num_polls += @as(i32, @boolToInt(!this.flags.contains(.has_incremented_poll_count)));
-        loop.active += @as(u32, @boolToInt(!this.flags.contains(.disable) and !this.flags.contains(.has_incremented_poll_count)));
+        loop.num_polls += @as(i32, @intFromBool(!this.flags.contains(.has_incremented_poll_count)));
+        loop.active += @as(u32, @intFromBool(!this.flags.contains(.disable) and !this.flags.contains(.has_incremented_poll_count)));
 
         this.flags.insert(.has_incremented_poll_count);
     }
@@ -3738,7 +3742,7 @@ pub const FilePoll = struct {
                 else => unreachable,
             };
 
-            var event = linux.epoll_event{ .events = flags, .data = .{ .u64 = @ptrToInt(Pollable.init(this).ptr()) } };
+            var event = linux.epoll_event{ .events = flags, .data = .{ .u64 = @intFromPtr(Pollable.init(this).ptr()) } };
 
             const ctl = linux.epoll_ctl(
                 watcher_fd,
@@ -3759,7 +3763,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_READ,
                     .data = 0,
                     .fflags = 0,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_ADD | one_shot_flag,
                     .ext = .{ this.generation_number, 0 },
                 },
@@ -3768,7 +3772,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_WRITE,
                     .data = 0,
                     .fflags = 0,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_ADD | one_shot_flag,
                     .ext = .{ this.generation_number, 0 },
                 },
@@ -3777,7 +3781,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_PROC,
                     .data = 0,
                     .fflags = std.c.NOTE_EXIT,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_ADD | one_shot_flag,
                     .ext = .{ this.generation_number, 0 },
                 },
@@ -3786,7 +3790,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_MACHPORT,
                     .data = 0,
                     .fflags = 0,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_ADD | one_shot_flag,
                     .ext = .{ this.generation_number, 0 },
                 },
@@ -3911,7 +3915,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_READ,
                     .data = 0,
                     .fflags = 0,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_DELETE,
                     .ext = .{ 0, 0 },
                 },
@@ -3920,7 +3924,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_MACHPORT,
                     .data = 0,
                     .fflags = 0,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_DELETE,
                     .ext = .{ 0, 0 },
                 },
@@ -3929,7 +3933,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_WRITE,
                     .data = 0,
                     .fflags = 0,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_DELETE,
                     .ext = .{ 0, 0 },
                 },
@@ -3938,7 +3942,7 @@ pub const FilePoll = struct {
                     .filter = std.os.system.EVFILT_PROC,
                     .data = 0,
                     .fflags = std.c.NOTE_EXIT,
-                    .udata = @ptrToInt(Pollable.init(this).ptr()),
+                    .udata = @intFromPtr(Pollable.init(this).ptr()),
                     .flags = std.c.EV_DELETE,
                     .ext = .{ 0, 0 },
                 },
@@ -3973,7 +3977,7 @@ pub const FilePoll = struct {
 
             const errno = std.c.getErrno(rc);
             switch (rc) {
-                std.math.minInt(@TypeOf(rc))...-1 => return JSC.Maybe(void).errnoSys(@enumToInt(errno), .kevent).?,
+                std.math.minInt(@TypeOf(rc))...-1 => return JSC.Maybe(void).errnoSys(@intFromEnum(errno), .kevent).?,
                 else => {},
             }
         } else {
diff --git a/src/bun.js/bindings/BunString.cpp b/src/bun.js/bindings/BunString.cpp
index f737342f4..21541d711 100644
--- a/src/bun.js/bindings/BunString.cpp
+++ b/src/bun.js/bindings/BunString.cpp
@@ -7,6 +7,11 @@
 #include "GCDefferalContext.h"
 using namespace JSC;
 
+extern "C" bool Bun__WTFStringImpl__hasPrefix(const WTF::StringImpl* impl, const char* bytes, size_t length)
+{
+    return impl->startsWith(bytes, length);
+}
+
 extern "C" void Bun__WTFStringImpl__deref(WTF::StringImpl* impl)
 {
     impl->deref();
@@ -81,31 +86,69 @@ BunString toString(JSC::JSGlobalObject* globalObject, JSValue value)
     return fromJS(globalObject, value);
 }
 
+BunString toStringRef(JSC::JSGlobalObject* globalObject, JSValue value)
+{
+    auto str = value.toWTFString(globalObject);
+    if (str.isEmpty()) {
+        return { BunStringTag::Empty };
+    }
+
+    str.impl()->ref();
+
+    return { BunStringTag::WTFStringImpl, { .wtf = str.impl() } };
+}
+
 BunString toString(WTF::String& wtfString)
 {
-    if (wtfString.length() == 0)
+    if (wtfString.isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
 }
 BunString toString(const WTF::String& wtfString)
 {
-    if (wtfString.length() == 0)
+    if (wtfString.isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
 }
 BunString toString(WTF::StringImpl* wtfString)
 {
-    if (wtfString->length() == 0)
+    if (wtfString->isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString } };
 }
 
+BunString toStringRef(WTF::String& wtfString)
+{
+    if (wtfString.isEmpty())
+        return { BunStringTag::Empty };
+
+    wtfString.impl()->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
+}
+BunString toStringRef(const WTF::String& wtfString)
+{
+    if (wtfString.isEmpty())
+        return { BunStringTag::Empty };
+
+    wtfString.impl()->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
+}
+BunString toStringRef(WTF::StringImpl* wtfString)
+{
+    if (wtfString->isEmpty())
+        return { BunStringTag::Empty };
+
+    wtfString->ref();
+
+    return { BunStringTag::WTFStringImpl, { .wtf = wtfString } };
+}
+
 BunString fromString(WTF::String& wtfString)
 {
-    if (wtfString.length() == 0)
+    if (wtfString.isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
@@ -113,7 +156,7 @@ BunString fromString(WTF::String& wtfString)
 
 BunString fromString(WTF::StringImpl* wtfString)
 {
-    if (wtfString->length() == 0)
+    if (wtfString->isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString } };
@@ -126,6 +169,29 @@ extern "C" JSC::EncodedJSValue BunString__toJS(JSC::JSGlobalObject* globalObject
     return JSValue::encode(Bun::toJS(globalObject, *bunString));
 }
 
+extern "C" BunString BunString__fromUTF16Unitialized(size_t length)
+{
+    unsigned utf16Length = length;
+    UChar* ptr;
+    auto impl = WTF::StringImpl::createUninitialized(utf16Length, ptr);
+    // Report Dead when no buffer was produced, or when narrowing `length` to `unsigned` lost bits.
+    if (UNLIKELY(!ptr || utf16Length != length))
+        return { BunStringTag::Dead };
+    impl->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = &impl.leakRef() } };
+}
+
+extern "C" BunString BunString__fromLatin1Unitialized(size_t length)
+{
+    unsigned latin1Length = length;
+    LChar* ptr;
+    auto impl = WTF::StringImpl::createUninitialized(latin1Length, ptr);
+    if (UNLIKELY(!ptr || latin1Length != length)) // also Dead when narrowing `length` to `unsigned` lost bits
+        return { BunStringTag::Dead };
+    impl->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = &impl.leakRef() } };
+}
+
 extern "C" BunString BunString__fromUTF8(const char* bytes, size_t length)
 {
     if (simdutf::validate_utf8(bytes, length)) {
diff --git a/src/bun.js/bindings/CommonJSModuleRecord.cpp b/src/bun.js/bindings/CommonJSModuleRecord.cpp
index 1cee1091b..8adba197c 100644
--- a/src/bun.js/bindings/CommonJSModuleRecord.cpp
+++ b/src/bun.js/bindings/CommonJSModuleRecord.cpp
@@ -59,458 +59,914 @@
 
 #include <JavaScriptCore/DFGAbstractHeap.h>
 #include <JavaScriptCore/Completion.h>
+#include "ModuleLoader.h"
 #include <JavaScriptCore/JSMap.h>
 
 #include <JavaScriptCore/JSMapInlines.h>
 #include <JavaScriptCore/GetterSetter.h>
 #include "ZigSourceProvider.h"
+#include "JavaScriptCore/FunctionPrototype.h"
+#include "CommonJSModuleRecord.h"
+#include <JavaScriptCore/JSModuleNamespaceObject.h>
+#include <JavaScriptCore/JSSourceCode.h>
+#include <JavaScriptCore/LazyPropertyInlines.h>
 
 namespace Bun {
 using namespace JSC;
 
-class JSCommonJSModule final : public JSC::JSNonFinalObject {
-public:
-    using Base = JSC::JSNonFinalObject;
-    static constexpr unsigned StructureFlags = Base::StructureFlags | JSC::OverridesPut;
+JSC_DECLARE_HOST_FUNCTION(jsFunctionRequireCommonJS);
 
-    mutable JSC::WriteBarrier<JSC::Unknown> m_exportsObject;
-    mutable JSC::WriteBarrier<JSC::JSString> m_id;
+static bool canPerformFastEnumeration(Structure* s)
+{
+    // True only when the structure has none of the features tested below:
+    // getOwnPropertySlot / getOwnPropertyNames overrides, indexed properties,
+    // getter/setter properties, uncacheable-dictionary mode, or a `__proto__`
+    // property other than the original one.
+    const bool overridesLookup = s->typeInfo().overridesGetOwnPropertySlot()
+        || s->typeInfo().overridesAnyFormOfGetOwnPropertyNames();
+
+    const bool hasDisqualifyingShape = hasIndexedProperties(s->indexingType())
+        || s->hasAnyKindOfGetterSetterProperties()
+        || s->isUncacheableDictionary()
+        || s->hasUnderscoreProtoPropertyExcludingOriginalProto();
+
+    return !overridesLookup && !hasDisqualifyingShape;
+}
 
-    void finishCreation(JSC::VM& vm, JSC::JSValue exportsObject, JSC::JSString* id, JSC::JSString* filename, JSC::JSString* dirname, JSC::JSValue requireFunction)
-    {
-        Base::finishCreation(vm);
-        ASSERT(inherits(vm, info()));
-        m_exportsObject.set(vm, this, exportsObject);
-        m_id.set(vm, this, id);
+static bool evaluateCommonJSModuleOnce(JSC::VM& vm, Zig::GlobalObject* globalObject, JSCommonJSModule* moduleObject, JSString* dirname, JSString* filename, WTF::NakedPtr<Exception>& exception) // Runs the module's stored source once; returns true when no exception was reported.
+{
+    JSC::Structure* thisObjectStructure = globalObject->commonJSFunctionArgumentsStructure(); // slots 0..2 are filled below with module, dirname, filename
+    JSC::JSObject* thisObject = JSC::constructEmptyObject(
+        vm,
+        thisObjectStructure);
+    thisObject->putDirectOffset(
+        vm,
+        0,
+        moduleObject);
 
-        this->putDirectOffset(
-            vm,
-            0,
-            exportsObject);
+    thisObject->putDirectOffset(
+        vm,
+        1,
+        dirname);
 
-        this->putDirectOffset(
-            vm,
-            1,
-            id);
+    thisObject->putDirectOffset(
+        vm,
+        2,
+        filename);
 
-        this->putDirectOffset(
-            vm,
-            2,
-            filename);
+    moduleObject->hasEvaluated = true; // flagged before the source runs, not after
+    globalObject->m_BunCommonJSModuleValue.set(vm, globalObject, thisObject);
 
-        this->putDirectOffset(
-            vm,
-            3,
-            jsBoolean(false));
+    JSC::evaluate(globalObject, moduleObject->sourceCode.get()->sourceCode(), thisObject, exception); // completion value is ignored; failures arrive through `exception`
+    moduleObject->sourceCode.clear(); // the source is dropped whether or not evaluation succeeded
 
-        this->putDirectOffset(
-            vm,
-            4,
-            dirname);
+    return exception.get() == nullptr;
+}
 
-        this->putDirectOffset(
-            vm,
-            5,
-            jsUndefined());
+JSC_DEFINE_HOST_FUNCTION(jsFunctionLoadModule, (JSGlobalObject * lexicalGlobalObject, CallFrame* callframe)) // Evaluates the JSCommonJSModule passed as argument 0 if it still has pending source.
+{
+    auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm());
+    JSCommonJSModule* moduleObject = jsDynamicCast<JSCommonJSModule*>(callframe->argument(0));
+    if (!moduleObject) {
+        RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(true))); // not a CommonJS module object: nothing to run
+    }
+
+    if (moduleObject->hasEvaluated || !moduleObject->sourceCode) {
+        RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(true))); // already evaluated, or no source attached
     }
 
-    static JSC::Structure* createStructure(
+    WTF::NakedPtr<Exception> exception;
+
+    evaluateCommonJSModuleOnce(
+        globalObject->vm(),
+        globalObject,
+        moduleObject,
+        moduleObject->m_dirname.get(),
+        moduleObject->m_filename.get(),
+        exception);
+
+    if (exception.get()) {
+        // On error, remove the module from the require map
+        // so that it can be re-evaluated on the next require.
+        globalObject->requireMap()->remove(globalObject, moduleObject->id());
+
+        throwException(globalObject, throwScope, exception.get());
+        exception.clear();
+        return JSValue::encode({});
+    }
+
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(true)));
+}
+
+JSC_DEFINE_HOST_FUNCTION(requireResolvePathsFunction, (JSGlobalObject * globalObject, CallFrame* callframe)) // Host function listed as `paths` in RequireResolveFunctionPrototypeValues.
+{
+    return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr, 0)); // ignores its arguments and always returns a new empty array
+}
+
+static const HashTableValue RequireResolveFunctionPrototypeValues[] = { // NOTE(review): presumably reified by RequireResolveFunctionPrototype::finishCreation, defined outside this excerpt — confirm
+    { "paths"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, requireResolvePathsFunction, 1 } }, // native `paths`, declared length 1
+};
+
+class RequireResolveFunctionPrototype final : public JSC::JSNonFinalObject { // Prototype object used for the structure of the `resolve` function built in RequireFunctionPrototype::create().
+public:
+    using Base = JSC::JSNonFinalObject;
+    static RequireResolveFunctionPrototype* create(
         JSC::JSGlobalObject* globalObject)
     {
         auto& vm = globalObject->vm();
-        JSC::Structure* structure = JSC::Structure::create(
-            vm,
-            globalObject,
-            globalObject->objectPrototype(),
-            JSC::TypeInfo(JSC::ObjectType, JSCommonJSModule::StructureFlags),
-            JSCommonJSModule::info(),
-            JSC::NonArray,
-            6);
 
-        JSC::PropertyOffset offset;
-        auto clientData = WebCore::clientData(vm);
+        auto* structure = RequireResolveFunctionPrototype::createStructure(vm, globalObject, globalObject->functionPrototype()); // its own prototype is the global's function prototype
+        RequireResolveFunctionPrototype* prototype = new (NotNull, JSC::allocateCell<RequireResolveFunctionPrototype>(vm)) RequireResolveFunctionPrototype(vm, structure);
+        prototype->finishCreation(vm);
+        return prototype;
+    }
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "exports"_s),
-            0,
-            offset);
+    DECLARE_INFO;
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "id"_s),
-            0,
-            offset);
+    RequireResolveFunctionPrototype(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : Base(vm, structure)
+    {
+    }
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "filename"_s),
-            0,
-            offset);
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        return &vm.plainObjectSpace(); // cells of this class are placed in the VM's plain-object subspace
+    }
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "loaded"_s),
-            0,
-            offset);
+    void finishCreation(JSC::VM& vm); // only declared here; the definition is not part of this excerpt
+};
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "path"_s),
-            0,
-            offset);
+static const HashTableValue RequireFunctionPrototypeValues[] = { // Static properties reified onto RequireFunctionPrototype in its finishCreation().
+    { "cache"_s, static_cast<unsigned>(JSC::PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, Zig::jsRequireCacheGetter, Zig::jsRequireCacheSetter } }, // `cache` accessor backed by the Zig:: getter/setter pair
+};
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "require"_s),
-            0,
-            offset);
+class RequireFunctionPrototype final : public JSC::JSNonFinalObject { // Prototype object carrying `cache`, `main`, `extensions` and `resolve`; used for the `require` function's structure.
+public:
+    using Base = JSC::JSNonFinalObject;
+    static RequireFunctionPrototype* create(
+        JSC::JSGlobalObject* globalObject)
+    {
+        auto& vm = globalObject->vm();
 
-        return structure;
+        auto* structure = RequireFunctionPrototype::createStructure(vm, globalObject, globalObject->functionPrototype()); // its own prototype is the global's function prototype
+        RequireFunctionPrototype* prototype = new (NotNull, JSC::allocateCell<RequireFunctionPrototype>(vm)) RequireFunctionPrototype(vm, structure);
+        prototype->finishCreation(vm);
+
+        JSFunction* resolveFunction = JSFunction::create(vm, moduleRequireResolveCodeGenerator(vm), globalObject->globalScope(), JSFunction::createStructure(vm, globalObject, RequireResolveFunctionPrototype::create(globalObject))); // builtin `resolve` whose prototype is a fresh RequireResolveFunctionPrototype
+        prototype->putDirect(vm, JSC::Identifier::fromString(vm, "resolve"_s), resolveFunction, PropertyAttribute::Function | 0);
+
+        return prototype;
     }
 
-    static JSCommonJSModule* create(
+    RequireFunctionPrototype(
         JSC::VM& vm,
-        JSC::Structure* structure,
-        JSC::JSValue exportsObject,
-        JSC::JSString* id,
-        JSC::JSString* filename,
-        JSC::JSString* dirname,
-        JSC::JSValue requireFunction)
+        JSC::Structure* structure)
+        : Base(vm, structure)
     {
-        JSCommonJSModule* cell = new (NotNull, JSC::allocateCell<JSCommonJSModule>(vm)) JSCommonJSModule(vm, structure);
-        cell->finishCreation(vm, exportsObject, id, filename, dirname, requireFunction);
-        return cell;
     }
 
-    JSValue exportsObject()
+    DECLARE_INFO;
+
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
-        return m_exportsObject.get();
+        return &vm.plainObjectSpace(); // cells of this class are placed in the VM's plain-object subspace
     }
 
-    JSValue id()
+    void finishCreation(JSC::VM& vm)
     {
-        return m_id.get();
+        Base::finishCreation(vm);
+        ASSERT(inherits(vm, info()));
+
+        reifyStaticProperties(vm, info(), RequireFunctionPrototypeValues, *this); // installs the `cache` accessor
+        JSC::JSFunction* requireDotMainFunction = JSFunction::create(
+            vm,
+            moduleMainCodeGenerator(vm),
+            globalObject()->globalScope());
+
+        this->putDirect(
+            vm,
+            JSC::Identifier::fromString(vm, "main"_s),
+            JSC::GetterSetter::create(vm, globalObject(), requireDotMainFunction, JSValue()), // getter only: the setter slot is an empty JSValue
+            PropertyAttribute::Builtin | PropertyAttribute::Accessor | PropertyAttribute::ReadOnly | 0);
+        this->putDirect(vm, JSC::Identifier::fromString(vm, "extensions"_s), constructEmptyObject(globalObject()), 0); // `extensions` starts as an empty object
     }
+};
 
-    DECLARE_VISIT_CHILDREN;
+JSC_DEFINE_CUSTOM_GETTER(getterFilename, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    // `filename` accessor: yields the stored filename string, or undefined when `this` is not a JSCommonJSModule.
+    if (auto* module = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue))) {
+        return JSValue::encode(module->m_filename.get());
+    }
+    return JSValue::encode(jsUndefined());
+}
+JSC_DEFINE_CUSTOM_GETTER(getterId, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    // `id` accessor: yields the stored id string, or undefined when `this` is not a JSCommonJSModule.
+    if (auto* module = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue))) {
+        return JSValue::encode(module->m_id.get());
+    }
+    return JSValue::encode(jsUndefined());
+}
 
-    static bool put(
-        JSC::JSCell* cell,
-        JSC::JSGlobalObject* globalObject,
-        JSC::PropertyName propertyName,
-        JSC::JSValue value,
-        JSC::PutPropertySlot& slot)
-    {
+JSC_DEFINE_CUSTOM_GETTER(getterPath, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName)) // `path` accessor: the module's directory name.
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject)) {
+        return JSValue::encode(jsUndefined());
+    }
+    return JSValue::encode(thisObject->m_dirname.get()); // `path` is the dirname (as in the replaced implementation), not the id
+}
 
-        auto& vm = globalObject->vm();
-        auto* clientData = WebCore::clientData(vm);
-        auto throwScope = DECLARE_THROW_SCOPE(vm);
+JSC_DEFINE_CUSTOM_SETTER(setterPath,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (!thisObject)
+        return false;
 
-        if (propertyName == clientData->builtinNames().exportsPublicName()) {
-            JSCommonJSModule* thisObject = jsCast<JSCommonJSModule*>(cell);
-            ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    thisObject->m_dirname.set(globalObject->vm(), thisObject, JSValue::decode(value).toString(globalObject)); // write the field getterPath reads, so assigning `path` no longer overwrites `id`
+    return true;
+}
 
-            // It will crash if we attempt to assign Object.defineProperty() result to a JSMap*.
-            if (UNLIKELY(slot.thisValue() != thisObject))
-                RELEASE_AND_RETURN(throwScope, JSObject::definePropertyOnReceiver(globalObject, propertyName, value, slot));
+extern "C" EncodedJSValue Resolver__propForRequireMainPaths(JSGlobalObject*);
 
-            JSValue prevValue = thisObject->m_exportsObject.get();
+JSC_DEFINE_CUSTOM_GETTER(getterPaths, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName)) // `paths` accessor: computed on first read, then cached in m_paths.
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject)) {
+        return JSValue::encode(jsUndefined());
+    }
 
-            // TODO: refactor this to not go through ESM path and we don't need to do this check.
-            // IF we do this on every call, it causes GC to happen in a place that it may not be able to.
-            // This breaks loading Bluebird in some cases, for example.
-            // We need to update the require map "live" because otherwise the code in Discord.js will break
-            // The bug is something to do with exception handling which causes GC to happen in the error path and then boom.
-            if (prevValue != value && (!prevValue.isCell() || !value.isCell() || prevValue.asCell()->type() != value.asCell()->type())) {
-                jsCast<Zig::GlobalObject*>(globalObject)->requireMap()->set(globalObject, thisObject->id(), value);
-            }
+    if (!thisObject->m_paths) { // nothing cached yet
+        JSValue paths = JSValue::decode(Resolver__propForRequireMainPaths(globalObject)); // value comes from the extern "C" resolver hook
+        thisObject->m_paths.set(globalObject->vm(), thisObject, paths);
+    }
 
-            thisObject->m_exportsObject.set(vm, thisObject, value);
-        }
+    return JSValue::encode(thisObject->m_paths.get());
+}
 
-        RELEASE_AND_RETURN(throwScope, Base::put(cell, globalObject, propertyName, value, slot));
+JSC_DEFINE_CUSTOM_SETTER(setterPaths,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    // Stores the assigned value as-is in m_paths; reports failure when `this` is not a JSCommonJSModule.
+    if (auto* module = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue))) {
+        module->m_paths.set(globalObject->vm(), module, JSValue::decode(value));
+        return true;
+    }
+    return false;
+}
+
+JSC_DEFINE_CUSTOM_SETTER(setterFilename,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    // Converts the assigned value with toString() and stores it in m_filename; fails when `this` is not a JSCommonJSModule.
+    if (auto* module = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue))) {
+        module->m_filename.set(globalObject->vm(), module, JSValue::decode(value).toString(globalObject));
+        return true;
+    }
+    return false;
+}
+
+JSC_DEFINE_CUSTOM_SETTER(setterId,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    // Converts the assigned value with toString() and stores it in m_id; fails when `this` is not a JSCommonJSModule.
+    if (auto* module = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue))) {
+        module->m_id.set(globalObject->vm(), module, JSValue::decode(value).toString(globalObject));
+        return true;
+    }
+    return false;
+}
+
+static JSValue createLoaded(VM& vm, JSObject* object) // Lazy-property callback for `loaded`.
+{
+    auto* cjs = jsDynamicCast<JSCommonJSModule*>(object); // checked cast: the table is reified on JSCommonJSModulePrototype, so `object` need not be a JSCommonJSModule
+    return jsBoolean(cjs && cjs->hasEvaluated); // NOTE(review): yields false when invoked on the prototype; a per-module accessor may be what is wanted — confirm
+}
+static JSValue createParent(VM& vm, JSObject* object) // Lazy-property callback for `parent`.
+{
+    return jsUndefined(); // always undefined; both parameters are unused
+}
+static JSValue createChildren(VM& vm, JSObject* object) // Lazy-property callback for `children`.
+{
+    return constructEmptyArray(object->globalObject(), nullptr, 0); // a new empty array created in the object's global
+}
+
+static const struct HashTableValue JSCommonJSModulePrototypeTableValues[] = { // Static properties reified onto JSCommonJSModulePrototype in its finishCreation().
+    { "children"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback | PropertyAttribute::DontEnum | 0), NoIntrinsic, { HashTableValue::LazyPropertyType, createChildren } },
+    { "filename"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterFilename, setterFilename } },
+    { "id"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterId, setterId } },
+    { "loaded"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback | PropertyAttribute::DontEnum | 0), NoIntrinsic, { HashTableValue::LazyPropertyType, createLoaded } },
+    { "parent"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback | PropertyAttribute::DontEnum | 0), NoIntrinsic, { HashTableValue::LazyPropertyType, createParent } },
+    { "path"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterPath, setterPath } },
+    { "paths"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterPaths, setterPaths } },
+};
+
+class JSCommonJSModulePrototype final : public JSC::JSNonFinalObject { // Prototype object passed to Structure::create in JSCommonJSModule::createStructure().
+public:
+    using Base = JSC::JSNonFinalObject;
+    static JSCommonJSModulePrototype* create(
+        JSC::VM& vm,
+        JSC::JSGlobalObject* globalObject,
+        JSC::Structure* structure)
+    {
+        JSCommonJSModulePrototype* prototype = new (NotNull, JSC::allocateCell<JSCommonJSModulePrototype>(vm)) JSCommonJSModulePrototype(vm, structure);
+        prototype->finishCreation(vm, globalObject);
+        return prototype;
     }
 
     DECLARE_INFO;
-    template<typename, SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+
+    JSCommonJSModulePrototype(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : Base(vm, structure)
     {
-        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
-            return nullptr;
-        return WebCore::subspaceForImpl<JSCommonJSModule, WebCore::UseCustomHeapCellType::No>(
-            vm,
-            [](auto& spaces) { return spaces.m_clientSubspaceForCommonJSModuleRecord.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); },
-            [](auto& spaces) { return spaces.m_subspaceForCommonJSModuleRecord.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_subspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); });
     }
 
-    JSCommonJSModule(JSC::VM& vm, JSC::Structure* structure)
-        : Base(vm, structure)
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        return &vm.plainObjectSpace(); // cells of this class are placed in the VM's plain-object subspace
+    }
+
+    void finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
     {
+        Base::finishCreation(vm);
+        ASSERT(inherits(vm, info()));
+        reifyStaticProperties(vm, JSCommonJSModule::info(), JSCommonJSModulePrototypeTableValues, *this); // installs children/filename/id/loaded/parent/path/paths on this prototype
+
+        JSFunction* requireFunction = JSFunction::create(
+            vm,
+            moduleRequireCodeGenerator(vm),
+            globalObject->globalScope(),
+            JSFunction::createStructure(vm, globalObject, RequireFunctionPrototype::create(globalObject))); // the builtin `require` gets a fresh RequireFunctionPrototype as its prototype
+
+        this->putDirect(vm, clientData(vm)->builtinNames().requirePublicName(), requireFunction, PropertyAttribute::Builtin | PropertyAttribute::Function | 0); // public `require`
+
+        this->putDirectNativeFunction(
+            vm,
+            globalObject,
+            clientData(vm)->builtinNames().requirePrivateName(), // private-named `require` bound to the native implementation below
+            2,
+            jsFunctionRequireCommonJS, ImplementationVisibility::Public, NoIntrinsic, JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::DontDelete | 0);
     }
 };
 
-Structure* createCommonJSModuleStructure(
-    Zig::GlobalObject* globalObject)
+const JSC::ClassInfo JSCommonJSModulePrototype::s_info = { "Module"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSCommonJSModulePrototype) }; // class name "Module", parent info is JSNonFinalObject's
+
+void JSCommonJSModule::finishCreation(JSC::VM& vm, JSC::JSString* id, JSC::JSString* filename, JSC::JSString* dirname, JSC::JSSourceCode* sourceCode) // Completes construction by storing the module's id, filename, dirname and source.
 {
-    return JSCommonJSModule::createStructure(globalObject);
+    Base::finishCreation(vm);
+    ASSERT(inherits(vm, info()));
+    m_id.set(vm, this, id);
+    m_filename.set(vm, this, filename);
+    m_dirname.set(vm, this, dirname);
+    this->sourceCode.set(vm, this, sourceCode); // the key-based create() overload passes nullptr here
 }
 
-template<typename Visitor>
-void JSCommonJSModule::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+JSC::Structure* JSCommonJSModule::createStructure( // Builds the Structure for module objects, with a fresh JSCommonJSModulePrototype as prototype.
+    JSC::JSGlobalObject* globalObject)
 {
-    JSCommonJSModule* thisObject = jsCast<JSCommonJSModule*>(cell);
-    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
-    Base::visitChildren(thisObject, visitor);
-    visitor.append(thisObject->m_exportsObject);
-    visitor.append(thisObject->m_id);
-}
+    auto& vm = globalObject->vm();
 
-DEFINE_VISIT_CHILDREN(JSCommonJSModule);
-const JSC::ClassInfo JSCommonJSModule::s_info = { "Module"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSCommonJSModule) };
+    auto* prototype = JSCommonJSModulePrototype::create(vm, globalObject, JSCommonJSModulePrototype::createStructure(vm, globalObject, globalObject->objectPrototype())); // the prototype itself inherits from the global's object prototype
 
-static bool canPerformFastEnumeration(Structure* s)
+    // Do not set the number of inline properties on this structure:
+    // there may be an off-by-one error in the Structure which causes `require.id` to become the require function.
+    return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info(), NonArray);
+}
+
+JSCommonJSModule* JSCommonJSModule::create( // Allocates a module cell and initializes it through finishCreation().
+    JSC::VM& vm,
+    JSC::Structure* structure,
+    JSC::JSString* id,
+    JSC::JSString* filename,
+    JSC::JSString* dirname,
+    JSC::JSSourceCode* sourceCode)
 {
-    if (s->typeInfo().overridesGetOwnPropertySlot())
-        return false;
-    if (s->typeInfo().overridesAnyFormOfGetOwnPropertyNames())
-        return false;
-    if (hasIndexedProperties(s->indexingType()))
-        return false;
-    if (s->hasAnyKindOfGetterSetterProperties())
-        return false;
-    if (s->isUncacheableDictionary())
-        return false;
-    if (s->hasUnderscoreProtoPropertyExcludingOriginalProto())
-        return false;
-    return true;
+    JSCommonJSModule* cell = new (NotNull, JSC::allocateCell<JSCommonJSModule>(vm)) JSCommonJSModule(vm, structure);
+    cell->finishCreation(vm, id, filename, dirname, sourceCode); // all four values are forwarded unchanged
+    return cell;
 }
 
-JSValue evaluateCommonJSModule(
-    Zig::GlobalObject* globalObject,
-    Ref<Zig::SourceProvider> sourceProvider,
-    const WTF::String& sourceURL,
-    ResolvedSource source)
+JSC_DEFINE_HOST_FUNCTION(jsFunctionCreateCommonJSModule, (JSGlobalObject * globalObject, CallFrame* callframe)) // Creates a JSCommonJSModule from (id, exports, hasEvaluated).
 {
     auto& vm = globalObject->vm();
 
-    auto throwScope = DECLARE_THROW_SCOPE(vm);
-    auto* requireMapKey = jsString(vm, sourceURL);
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    auto id = callframe->argument(0).toWTFString(globalObject); // string coercion of argument 0 can throw
+    RETURN_IF_EXCEPTION(throwScope, JSValue::encode({})); // do not build a module from a failed conversion
+    JSValue object = callframe->argument(1); // becomes the module's `exports` value
+    RELEASE_AND_RETURN(throwScope,
+        JSValue::encode(JSCommonJSModule::create(
+            jsCast<Zig::GlobalObject*>(globalObject),
+            id,
+            object, callframe->argument(2).isBoolean() && callframe->argument(2).asBoolean()))); // hasEvaluated only when argument 2 is literally `true`
+}
 
-    JSC::JSObject* exportsObject = source.commonJSExportsLen < 64
-        ? JSC::constructEmptyObject(globalObject, globalObject->objectPrototype(), source.commonJSExportsLen)
-        : JSC::constructEmptyObject(globalObject, globalObject->objectPrototype());
-    auto index = sourceURL.reverseFind('/', sourceURL.length());
+JSCommonJSModule* JSCommonJSModule::create( // Builds a source-less module for `key` and installs `exportsObject` as its `exports`.
+    Zig::GlobalObject* globalObject,
+    const WTF::String& key,
+    JSValue exportsObject,
+    bool hasEvaluated)
+{
+    auto& vm = globalObject->vm();
+    JSString* requireMapKey = JSC::jsStringWithCache(vm, key); // the same JSString serves as both id and filename below
+    auto index = key.reverseFind('/', key.length()); // position of the last '/', used to split off the dirname
     JSString* dirname = jsEmptyString(vm);
     JSString* filename = requireMapKey;
     if (index != WTF::notFound) {
         dirname = JSC::jsSubstring(globalObject, requireMapKey, 0, index);
     }
 
-    globalObject->requireMap()->set(globalObject, requireMapKey, exportsObject);
-    auto* requireFunction = Zig::ImportMetaObject::createRequireFunction(vm, globalObject, sourceURL);
-
-    JSC::SourceCode inputSource(
-        WTFMove(sourceProvider));
-
-    auto* moduleObject = JSCommonJSModule::create(
+    auto* out = JSCommonJSModule::create(
         vm,
         globalObject->CommonJSModuleObjectStructure(),
-        exportsObject,
-        requireMapKey, filename, dirname, requireFunction);
+        requireMapKey, filename, dirname, nullptr); // nullptr: no source code is attached to this module
 
-    if (UNLIKELY(throwScope.exception())) {
-        globalObject->requireMap()->remove(globalObject, requireMapKey);
-        RELEASE_AND_RETURN(throwScope, JSValue());
-    }
+    out->putDirect(vm, WebCore::clientData(vm)->builtinNames().exportsPublicName(), exportsObject, exportsObject.isCell() && exportsObject.isCallable() ? JSC::PropertyAttribute::Function | 0 : 0); // callable exports get the Function attribute
+    out->hasEvaluated = hasEvaluated;
+    return out;
+}
 
-    JSC::Structure* thisObjectStructure = globalObject->commonJSFunctionArgumentsStructure();
-    JSC::JSObject* thisObject = JSC::constructEmptyObject(
-        vm,
-        thisObjectStructure);
-    thisObject->putDirectOffset(
-        vm,
-        0,
-        moduleObject);
+void JSCommonJSModule::destroy(JSC::JSCell* cell) // Runs the C++ destructor on a cell known to be a JSCommonJSModule.
+{
+    static_cast<JSCommonJSModule*>(cell)->JSCommonJSModule::~JSCommonJSModule(); // explicit destructor call; no memory is freed here
+}
 
-    thisObject->putDirectOffset(
-        vm,
-        1,
-        exportsObject);
+JSCommonJSModule::~JSCommonJSModule() // Empty body: nothing beyond member destruction happens here.
+{
+}
 
-    thisObject->putDirectOffset(
-        vm,
-        2,
-        dirname);
+bool JSCommonJSModule::evaluate( // Populates this module's `exports` from a synthetic generator; returns false if the generator threw.
+    Zig::GlobalObject* globalObject,
+    const WTF::String& key,
+    const SyntheticSourceProvider::SyntheticSourceGenerator& generator)
+{
+    Vector<JSC::Identifier, 4> propertyNames;
+    JSC::MarkedArgumentBuffer arguments;
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    generator(globalObject, JSC::Identifier::fromString(vm, key), propertyNames, arguments); // fills propertyNames and arguments; entries are paired by index below
+    RETURN_IF_EXCEPTION(throwScope, false);
+
+    bool needsPut = false; // set when `exports` must be (re)assigned on this module
+    auto getDefaultValue = [&]() -> JSValue {
+        size_t defaultValueIndex = propertyNames.find(vm.propertyNames->defaultKeyword);
+        auto cjsSymbol = Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s));
+
+        if (defaultValueIndex != notFound && propertyNames.contains(cjsSymbol)) { // generator output has both a `default` entry and the "CommonJS" marker symbol
+            JSValue current = arguments.at(defaultValueIndex);
+            needsPut = true;
+            return current; // the `default` value itself becomes `exports`
+        }
 
-    thisObject->putDirectOffset(
-        vm,
-        3,
-        filename);
+        size_t count = propertyNames.size();
+        JSValue existingDefaultObject = this->getIfPropertyExists(globalObject, WebCore::clientData(vm)->builtinNames().exportsPublicName());
+        JSObject* defaultObject;
 
-    thisObject->putDirectOffset(
-        vm,
-        4,
-        requireFunction);
+        if (existingDefaultObject && existingDefaultObject.isObject()) {
+            defaultObject = jsCast<JSObject*>(existingDefaultObject); // reuse the current `exports` object
+        } else {
+            defaultObject = JSC::constructEmptyObject(globalObject, globalObject->objectPrototype());
+            needsPut = true;
+        }
 
-    {
-        WTF::NakedPtr<Exception> exception;
-        globalObject->m_BunCommonJSModuleValue.set(vm, globalObject, thisObject);
-        JSC::evaluate(globalObject, inputSource, globalObject->globalThis(), exception);
-
-        if (exception.get()) {
-            throwScope.throwException(globalObject, exception->value());
-            exception.clear();
-            RELEASE_AND_RETURN(throwScope, JSValue());
+        for (size_t i = 0; i < count; ++i) { // copy every generated name/value pair onto the exports object
+            auto prop = propertyNames[i];
+            unsigned attributes = 0;
+
+            JSValue value = arguments.at(i);
+
+            if (prop.isSymbol()) {
+                attributes |= JSC::PropertyAttribute::DontEnum; // symbol-keyed entries are non-enumerable
+            }
+
+            if (value.isCell() && value.isCallable()) {
+                attributes |= JSC::PropertyAttribute::Function;
+            }
+
+            defaultObject->putDirect(vm, prop, value, attributes);
+        }
+
+        return defaultObject;
+    };
+
+    JSValue defaultValue = getDefaultValue();
+    if (needsPut) {
+        unsigned attributes = 0;
+
+        if (defaultValue.isCell() && defaultValue.isCallable()) {
+            attributes |= JSC::PropertyAttribute::Function;
         }
-    }
 
-    if (UNLIKELY(throwScope.exception())) {
-        globalObject->requireMap()->remove(globalObject, requireMapKey);
-        RELEASE_AND_RETURN(throwScope, JSValue());
+        this->putDirect(vm, WebCore::clientData(vm)->builtinNames().exportsPublicName(), defaultValue, attributes);
     }
 
-    JSValue result = moduleObject->exportsObject();
+    this->hasEvaluated = true;
+    RELEASE_AND_RETURN(throwScope, true);
+}
+
+void JSCommonJSModule::toSyntheticSource(JSC::JSGlobalObject* globalObject,
+    JSC::Identifier moduleKey,
+    Vector<JSC::Identifier, 4>& exportNames,
+    JSC::MarkedArgumentBuffer& exportValues)
+{
+    auto result = this->exportsObject();
+
+    auto& vm = globalObject->vm();
+
+    // This exists to tell ImportMetaObject.ts that this is a CommonJS module.
+    exportNames.append(Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s)));
+    exportValues.append(jsNumber(0));
 
-    // The developer can do something like:
+    // Bun's interpretation of the "__esModule" annotation:
     //
-    //   Object.defineProperty(module, 'exports', {get: getter})
+    //   - If a "default" export does not exist OR the __esModule annotation is not present, then we
+    //   set the default export to the exports object
     //
-    // In which case, the exports object is now a GetterSetter object.
+    //   - If a "default" export also exists, then we set the default export
+    //   to the value of it (matching Babel behavior)
     //
-    // We can't return a GetterSetter object to ESM code, so we need to call it.
-    if (!result.isEmpty() && (result.isGetterSetter() || result.isCustomGetterSetter())) {
-        auto* clientData = WebCore::clientData(vm);
-
-        // TODO: is there a faster way to call these getters? We shouldn't need to do a full property lookup.
-        //
-        // we use getIfPropertyExists just incase a pathological devleoper did:
-        //
-        //   - Object.defineProperty(module, 'exports', {get: getter})
-        //   - delete module.exports
-        //
-        if (result.isGetterSetter()) {
-            JSC::GetterSetter* getter = jsCast<JSC::GetterSetter*>(result);
-            result = getter->callGetter(globalObject, moduleObject);
-        } else {
-            result = moduleObject->getIfPropertyExists(globalObject, clientData->builtinNames().exportsPublicName());
+    // https://stackoverflow.com/questions/50943704/whats-the-purpose-of-object-definepropertyexports-esmodule-value-0
+    // https://github.com/nodejs/node/issues/40891
+    // https://github.com/evanw/bundler-esm-cjs-tests
+    // https://github.com/evanw/esbuild/issues/1591
+    // https://github.com/oven-sh/bun/issues/3383
+    //
+    // Note that this interpretation is slightly different
+    //
+    //    -  We do not ignore when "type": "module" or when the file
+    //       extension is ".mjs". Build tools determine that based on the
+    //       caller's behavior, but in a JS runtime, there is only one ModuleNamespaceObject.
+    //
+    //       It would be possible to match the behavior at runtime, but
+    //       it would need further engine changes which do not match the ES Module spec
+    //
+    //   -   We ignore the value of the annotation. We only look for the
+    //       existence of the value being set. This is for performance reasons, but also
+    //       this annotation is meant for tooling and the only usages of setting
+    //       it to something that does NOT evaluate to "true" I could find were in
+    //       unit tests of build tools. Happy to revisit this if users file an issue.
+    bool needsToAssignDefault = true;
+
+    if (result.isObject()) {
+        auto* exports = asObject(result);
+
+        auto* structure = exports->structure();
+        uint32_t size = structure->inlineSize() + structure->outOfLineSize();
+        exportNames.reserveCapacity(size + 2);
+        exportValues.ensureCapacity(size + 2);
+
+        auto catchScope = DECLARE_CATCH_SCOPE(vm);
+
+        Identifier esModuleMarker = builtinNames(vm).__esModulePublicName();
+        bool hasESModuleMarker = !this->ignoreESModuleAnnotation && exports->hasProperty(globalObject, esModuleMarker);
+        if (catchScope.exception()) {
+            catchScope.clearException();
         }
 
-        if (UNLIKELY(throwScope.exception())) {
-            // Unlike getters on properties of the exports object
-            // When the exports object itself is a getter and it throws
-            // There's not a lot we can do
-            // so we surface that error
-            globalObject->requireMap()->remove(globalObject, requireMapKey);
-            RELEASE_AND_RETURN(throwScope, JSValue());
+        if (hasESModuleMarker) {
+            if (canPerformFastEnumeration(structure)) {
+                exports->structure()->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
+                    auto key = entry.key();
+                    if (key->isSymbol() || entry.attributes() & PropertyAttribute::DontEnum || key == esModuleMarker)
+                        return true;
+
+                    needsToAssignDefault = needsToAssignDefault && key != vm.propertyNames->defaultKeyword;
+
+                    JSValue value = exports->getDirect(entry.offset());
+                    exportNames.append(Identifier::fromUid(vm, key));
+                    exportValues.append(value);
+                    return true;
+                });
+            } else {
+                JSC::PropertyNameArray properties(vm, JSC::PropertyNameMode::Strings, JSC::PrivateSymbolMode::Exclude);
+                exports->methodTable()->getOwnPropertyNames(exports, globalObject, properties, DontEnumPropertiesMode::Exclude);
+                if (catchScope.exception()) {
+                    catchScope.clearExceptionExceptTermination();
+                    return;
+                }
+
+                for (auto property : properties) {
+                    if (UNLIKELY(property.isEmpty() || property.isNull() || property == esModuleMarker || property.isPrivateName() || property.isSymbol()))
+                        continue;
+
+                    // ignore constructor
+                    if (property == vm.propertyNames->constructor)
+                        continue;
+
+                    JSC::PropertySlot slot(exports, PropertySlot::InternalMethodType::Get);
+                    if (!exports->getPropertySlot(globalObject, property, slot))
+                        continue;
+
+                    exportNames.append(property);
+
+                    JSValue getterResult = slot.getValue(globalObject, property);
+
+                    // If it throws, we keep them in the exports list, but mark it as undefined
+                    // This is consistent with what Node.js does.
+                    if (catchScope.exception()) {
+                        catchScope.clearException();
+                        getterResult = jsUndefined();
+                    }
+
+                    exportValues.append(getterResult);
+
+                    needsToAssignDefault = needsToAssignDefault && property != vm.propertyNames->defaultKeyword;
+                }
+            }
+
+        } else if (canPerformFastEnumeration(structure)) {
+            exports->structure()->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
+                auto key = entry.key();
+                if (key->isSymbol() || entry.attributes() & PropertyAttribute::DontEnum || key == vm.propertyNames->defaultKeyword)
+                    return true;
+
+                JSValue value = exports->getDirect(entry.offset());
+
+                exportNames.append(Identifier::fromUid(vm, key));
+                exportValues.append(value);
+                return true;
+            });
+        } else {
+            JSC::PropertyNameArray properties(vm, JSC::PropertyNameMode::Strings, JSC::PrivateSymbolMode::Exclude);
+            exports->methodTable()->getOwnPropertyNames(exports, globalObject, properties, DontEnumPropertiesMode::Exclude);
+            if (catchScope.exception()) {
+                catchScope.clearExceptionExceptTermination();
+                return;
+            }
+
+            for (auto property : properties) {
+                if (UNLIKELY(property.isEmpty() || property.isNull() || property == vm.propertyNames->defaultKeyword || property.isPrivateName() || property.isSymbol()))
+                    continue;
+
+                // ignore constructor
+                if (property == vm.propertyNames->constructor)
+                    continue;
+
+                JSC::PropertySlot slot(exports, PropertySlot::InternalMethodType::Get);
+                if (!exports->getPropertySlot(globalObject, property, slot))
+                    continue;
+
+                exportNames.append(property);
+
+                JSValue getterResult = slot.getValue(globalObject, property);
+
+                // If it throws, we keep them in the exports list, but mark it as undefined
+                // This is consistent with what Node.js does.
+                if (catchScope.exception()) {
+                    catchScope.clearException();
+                    getterResult = jsUndefined();
+                }
+
+                exportValues.append(getterResult);
+            }
         }
     }
 
-    globalObject->requireMap()->set(globalObject, requireMapKey, result);
+    if (needsToAssignDefault) {
+        exportNames.append(vm.propertyNames->defaultKeyword);
+        exportValues.append(result);
+    }
+}
+
+JSValue JSCommonJSModule::exportsObject() // returns the current value of this module's exports property
+{
+    return this->get(globalObject(), JSC::PropertyName(clientData(vm())->builtinNames().exportsPublicName())); // looked up through get() on every call; nothing is cached here
+}
 
-    return result;
+JSValue JSCommonJSModule::id() // returns the JSString held in m_id
+{
+    return m_id.get();
+}
+
+bool JSCommonJSModule::put(
+    JSC::JSCell* cell,
+    JSC::JSGlobalObject* globalObject,
+    JSC::PropertyName propertyName,
+    JSC::JSValue value,
+    JSC::PutPropertySlot& slot)
+{
+    // Override hook enabled by OverridesPut in StructureFlags. No property is
+    // intercepted here; every write is forwarded unchanged to the base class.
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    RELEASE_AND_RETURN(throwScope, Base::put(cell, globalObject, propertyName, value, slot));
+}
+
+template<typename, SubspaceAccess mode> JSC::GCClient::IsoSubspace* JSCommonJSModule::subspaceFor(JSC::VM& vm)
+{
+    if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+        return nullptr; // no subspace is handed out for concurrent access
+    return WebCore::subspaceForImpl<JSCommonJSModule, WebCore::UseCustomHeapCellType::No>(
+        vm,
+        [](auto& spaces) { return spaces.m_clientSubspaceForCommonJSModuleRecord.get(); }, // reads the client subspace slot
+        [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); }, // stores the client subspace slot
+        [](auto& spaces) { return spaces.m_subspaceForCommonJSModuleRecord.get(); }, // reads the non-client subspace slot
+        [](auto& spaces, auto&& space) { spaces.m_subspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); }); // stores the non-client subspace slot
+}
+
+Structure* createCommonJSModuleStructure(
+    Zig::GlobalObject* globalObject)
+{
+    return JSCommonJSModule::createStructure(globalObject); // thin forwarder to the class's own structure factory
 }
 
-JSC::SourceCode createCommonJSModule(
+template<typename Visitor>
+void JSCommonJSModule::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{
+    JSCommonJSModule* thisObject = jsCast<JSCommonJSModule*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    Base::visitChildren(thisObject, visitor); // base-class children are visited first
+    visitor.append(thisObject->m_id); // then every WriteBarrier member this class declares
+    visitor.append(thisObject->sourceCode);
+    visitor.append(thisObject->m_filename);
+    visitor.append(thisObject->m_dirname);
+    visitor.append(thisObject->m_paths);
+}
+
+DEFINE_VISIT_CHILDREN(JSCommonJSModule);
+const JSC::ClassInfo JSCommonJSModule::s_info = { "Module"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSCommonJSModule) };
+const JSC::ClassInfo RequireResolveFunctionPrototype::s_info = { "resolve"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(RequireResolveFunctionPrototype) };
+const JSC::ClassInfo RequireFunctionPrototype::s_info = { "require"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(RequireFunctionPrototype) };
+
+JSC_DEFINE_HOST_FUNCTION(jsFunctionRequireCommonJS, (JSGlobalObject * lexicalGlobalObject, CallFrame* callframe)) // host function: this = requiring module, argument 0 = specifier, argument 1 = module object handed to fetchCommonJSModule
+{
+    auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(callframe->thisValue()); // a non-module `this` is rejected with a TypeError below
+    if (!thisObject)
+        return throwVMTypeError(globalObject, throwScope);
+
+    JSValue specifierValue = callframe->argument(0);
+    WTF::String specifier = specifierValue.toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, {});
+
+    // Special-case for "process" to just return the process object directly.
+    if (UNLIKELY(specifier == "process"_s || specifier == "node:process"_s)) {
+        if (auto* target = jsDynamicCast<JSCommonJSModule*>(callframe->argument(1))) target->putDirect(vm, builtinNames(vm).exportsPublicName(), globalObject->processObject(), 0); // jsDynamicCast yields nullptr for a non-module argument: skip the store rather than dereference it
+        return JSValue::encode(globalObject->processObject());
+    }
+
+    WTF::String referrer = thisObject->id().toWTFString(globalObject); // the requiring module's id is used as the referrer
+    RETURN_IF_EXCEPTION(throwScope, {});
+
+    BunString specifierStr = Bun::toString(specifier);
+    BunString referrerStr = Bun::toString(referrer);
+
+    JSValue fetchResult = Bun::fetchCommonJSModule(
+        globalObject,
+        jsDynamicCast<JSCommonJSModule*>(callframe->argument(1)),
+        specifierValue,
+        &specifierStr,
+        &referrerStr);
+
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(fetchResult));
+}
+
+void RequireResolveFunctionPrototype::finishCreation(JSC::VM& vm)
+{
+    Base::finishCreation(vm);
+    ASSERT(inherits(vm, info()));
+
+    reifyStaticProperties(vm, RequireResolveFunctionPrototype::info(), RequireResolveFunctionPrototypeValues, *this); // installs the entries of RequireResolveFunctionPrototypeValues on this object
+    JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+}
+
+bool JSCommonJSModule::evaluate( // evaluates this module unless it already has been; returns false after rethrowing when evaluation raised an exception
     Zig::GlobalObject* globalObject,
+    const WTF::String& key, // not referenced in the body
     ResolvedSource source)
 {
-    auto sourceURL = Zig::toStringCopy(source.source_url);
-    auto sourceProvider = Zig::SourceProvider::create(globalObject, source, JSC::SourceProviderSourceType::Program);
+    auto& vm = globalObject->vm();
+    auto sourceProvider = Zig::SourceProvider::create(jsCast<Zig::GlobalObject*>(globalObject), source, JSC::SourceProviderSourceType::Program);
+    this->ignoreESModuleAnnotation = source.tag == ResolvedSourceTagPackageJSONTypeModule; // refreshed even when the module was already evaluated
+    JSC::SourceCode rawInputSource(
+        WTFMove(sourceProvider));
 
-    return JSC::SourceCode(
-        JSC::SyntheticSourceProvider::create(
-            [source, sourceProvider = WTFMove(sourceProvider), sourceURL](JSC::JSGlobalObject* globalObject,
-                JSC::Identifier moduleKey,
-                Vector<JSC::Identifier, 4>& exportNames,
-                JSC::MarkedArgumentBuffer& exportValues) -> void {
-                JSValue result = evaluateCommonJSModule(
-                    jsCast<Zig::GlobalObject*>(globalObject),
-                    WTFMove(sourceProvider),
-                    sourceURL,
-                    source);
+    if (this->hasEvaluated)
+        return true; // already evaluated: nothing to run, reported as success
 
-                if (!result) {
-                    return;
-                }
+    this->sourceCode.set(vm, this, JSC::JSSourceCode::create(vm, WTFMove(rawInputSource))); // replaces the stored source with the freshly created one
 
-                auto& vm = globalObject->vm();
+    WTF::NakedPtr<JSC::Exception> exception;
 
-                exportNames.append(vm.propertyNames->defaultKeyword);
-                exportValues.append(result);
-
-                // This exists to tell ImportMetaObject.ts that this is a CommonJS module.
-                exportNames.append(Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s)));
-                exportValues.append(jsNumber(0));
-
-                if (result.isObject()) {
-                    DeferGCForAWhile deferGC(vm);
-                    auto* exports = asObject(result);
-
-                    auto* structure = exports->structure();
-                    uint32_t size = structure->inlineSize() + structure->outOfLineSize();
-                    exportNames.reserveCapacity(size + 2);
-                    exportValues.ensureCapacity(size + 2);
-
-                    if (canPerformFastEnumeration(structure)) {
-                        exports->structure()->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
-                            auto key = entry.key();
-                            if (key->isSymbol() || key == vm.propertyNames->defaultKeyword || entry.attributes() & PropertyAttribute::DontEnum)
-                                return true;
-
-                            exportNames.append(Identifier::fromUid(vm, key));
-
-                            JSValue value = exports->getDirect(entry.offset());
-
-                            exportValues.append(value);
-                            return true;
-                        });
-                    } else {
-                        auto catchScope = DECLARE_CATCH_SCOPE(vm);
-                        JSC::PropertyNameArray properties(vm, JSC::PropertyNameMode::Strings, JSC::PrivateSymbolMode::Exclude);
-                        exports->methodTable()->getOwnPropertyNames(exports, globalObject, properties, DontEnumPropertiesMode::Exclude);
-                        if (catchScope.exception()) {
-                            catchScope.clearExceptionExceptTermination();
-                            return;
-                        }
+    evaluateCommonJSModuleOnce(vm, globalObject, this, this->m_dirname.get(), this->m_filename.get(), exception); // return value is ignored; failure is detected through `exception`
+
+    if (exception.get()) {
+        // On error, remove the module from the require map
+        // so that it can be re-evaluated on the next require.
+        globalObject->requireMap()->remove(globalObject, this->id());
+
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        throwException(globalObject, throwScope, exception.get()); // rethrow the captured exception to the caller
+        exception.clear();
+
+        return false;
+    }
+
+    return true;
+}
+
+std::optional<JSC::SourceCode> createCommonJSModule( // finds or creates the module object for `source`, registers it in the require map, and returns a synthetic source that evaluates it lazily
+    Zig::GlobalObject* globalObject,
+    ResolvedSource source)
+{
+    JSCommonJSModule* moduleObject = nullptr; // must start null: it is only assigned when the require map has an entry, and is tested right after
+    WTF::String sourceURL = toStringCopy(source.source_url);
+
+    JSValue specifierValue = Bun::toJS(globalObject, source.specifier);
+    JSValue entry = globalObject->requireMap()->get(globalObject, specifierValue);
 
-                        for (auto property : properties) {
-                            if (UNLIKELY(property.isEmpty() || property.isNull() || property.isPrivateName() || property.isSymbol()))
-                                continue;
+    auto sourceProvider = Zig::SourceProvider::create(jsCast<Zig::GlobalObject*>(globalObject), source, JSC::SourceProviderSourceType::Program);
+    bool ignoreESModuleAnnotation = source.tag == ResolvedSourceTagPackageJSONTypeModule;
+    SourceOrigin sourceOrigin = sourceProvider->sourceOrigin(); // captured now because sourceProvider may be moved from below
 
-                            // ignore constructor
-                            if (property == vm.propertyNames->constructor || property == vm.propertyNames->defaultKeyword)
-                                continue;
+    if (entry) {
+        moduleObject = jsDynamicCast<JSCommonJSModule*>(entry); // stays null when the entry is some other kind of value
+    }
+
+    if (!moduleObject) {
+        auto& vm = globalObject->vm();
+        auto* requireMapKey = jsStringWithCache(vm, sourceURL);
+        auto index = sourceURL.reverseFind('/', sourceURL.length());
+        JSString* dirname = jsEmptyString(vm); // stays empty when the URL contains no '/'
+        JSString* filename = requireMapKey;
+        if (index != WTF::notFound) {
+            dirname = JSC::jsSubstring(globalObject, requireMapKey, 0, index); // everything before the last '/'
+        }
 
-                            JSC::PropertySlot slot(exports, PropertySlot::InternalMethodType::Get);
-                            if (!exports->getPropertySlot(globalObject, property, slot))
-                                continue;
+        JSC::SourceCode rawInputSource(
+            WTFMove(sourceProvider));
 
-                            exportNames.append(property);
+        moduleObject = JSCommonJSModule::create(
+            vm,
+            globalObject->CommonJSModuleObjectStructure(),
+            requireMapKey, filename, dirname, JSC::JSSourceCode::create(vm, WTFMove(rawInputSource)));
 
-                            JSValue getterResult = slot.getValue(globalObject, property);
+        moduleObject->putDirect(vm,
+            WebCore::clientData(vm)->builtinNames().exportsPublicName(),
+            JSC::constructEmptyObject(globalObject, globalObject->objectPrototype()), 0); // a new module starts with an empty exports object
 
-                            // If it throws, we keep them in the exports list, but mark it as undefined
-                            // This is consistent with what Node.js does.
-                            if (catchScope.exception()) {
-                                catchScope.clearException();
-                                getterResult = jsUndefined();
-                            }
+        globalObject->requireMap()->set(globalObject, requireMapKey, moduleObject);
+    }
+
+    moduleObject->ignoreESModuleAnnotation = ignoreESModuleAnnotation;
+
+    return JSC::SourceCode(
+        JSC::SyntheticSourceProvider::create(
+            [](JSC::JSGlobalObject* lexicalGlobalObject,
+                JSC::Identifier moduleKey,
+                Vector<JSC::Identifier, 4>& exportNames,
+                JSC::MarkedArgumentBuffer& exportValues) -> void {
+                auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+                auto& vm = globalObject->vm();
 
-                            exportValues.append(getterResult);
+                JSValue keyValue = identifierToJSValue(vm, moduleKey);
+                JSValue entry = globalObject->requireMap()->get(globalObject, keyValue); // the lambda captures nothing; the module is found again through the require map
+
+                if (entry) {
+                    if (auto* moduleObject = jsDynamicCast<JSCommonJSModule*>(entry)) {
+                        if (!moduleObject->hasEvaluated) {
+                            WTF::NakedPtr<JSC::Exception> exception;
+                            if (!evaluateCommonJSModuleOnce(
+                                    vm,
+                                    globalObject,
+                                    moduleObject,
+                                    moduleObject->m_dirname.get(),
+                                    moduleObject->m_filename.get(), exception)) {
+
+                                // On error, remove the module from the require map
+                                // so that it can be re-evaluated on the next require.
+                                globalObject->requireMap()->remove(globalObject, moduleObject->id());
+
+                                auto scope = DECLARE_THROW_SCOPE(vm);
+                                throwException(globalObject, scope, exception.get());
+                                exception.clear();
+                                return;
+                            }
                         }
+
+                        moduleObject->toSyntheticSource(globalObject, moduleKey, exportNames, exportValues);
                     }
                 }
             },
-            SourceOrigin(WTF::URL::fileURLWithFileSystemPath(sourceURL)),
+            sourceOrigin,
             sourceURL));
 }
-
 }
 \ No newline at end of file
diff --git a/src/bun.js/bindings/CommonJSModuleRecord.h b/src/bun.js/bindings/CommonJSModuleRecord.h
index 86daf875d..15792f9da 100644
--- a/src/bun.js/bindings/CommonJSModuleRecord.h
+++ b/src/bun.js/bindings/CommonJSModuleRecord.h
@@ -6,14 +6,92 @@ class GlobalObject;
 }
 namespace JSC {
 class SourceCode;
+class JSSourceCode;
+class ProgramExecutable;
+class AbstractModuleRecord;
 }
 
 namespace Bun {
 
+JSC_DECLARE_HOST_FUNCTION(jsFunctionCreateCommonJSModule);
+JSC_DECLARE_HOST_FUNCTION(jsFunctionLoadModule);
+
+class JSCommonJSModule final : public JSC::JSDestructibleObject { // object representing one CommonJS module
+public:
+    using Base = JSC::JSDestructibleObject;
+    static constexpr unsigned StructureFlags = Base::StructureFlags | JSC::OverridesPut; // OverridesPut pairs with the static put() declared below
+
+    mutable JSC::WriteBarrier<JSC::JSString> m_id; // returned by id()
+    mutable JSC::WriteBarrier<JSC::JSString> m_filename;
+    mutable JSC::WriteBarrier<JSC::JSString> m_dirname;
+    mutable JSC::WriteBarrier<Unknown> m_paths;
+    mutable JSC::WriteBarrier<JSC::JSSourceCode> sourceCode; // source to evaluate for this module
+    bool ignoreESModuleAnnotation { false }; // when true, toSyntheticSource does not look for the __esModule marker
+
+    static void destroy(JSC::JSCell*);
+    ~JSCommonJSModule();
+
+    void finishCreation(JSC::VM& vm,
+        JSC::JSString* id, JSC::JSString* filename,
+        JSC::JSString* dirname, JSC::JSSourceCode* sourceCode);
+
+    static JSC::Structure* createStructure(JSC::JSGlobalObject* globalObject);
+
+    bool evaluate(Zig::GlobalObject* globalObject, const WTF::String& sourceURL, ResolvedSource resolvedSource); // returns false when evaluation threw
+    bool evaluate(Zig::GlobalObject* globalObject, const WTF::String& key, const SyntheticSourceProvider::SyntheticSourceGenerator& generator);
+
+    static JSCommonJSModule* create(JSC::VM& vm, JSC::Structure* structure,
+        JSC::JSString* id,
+        JSC::JSString* filename,
+        JSC::JSString* dirname, JSC::JSSourceCode* sourceCode);
+
+    static JSCommonJSModule* create(
+        Zig::GlobalObject* globalObject,
+        const WTF::String& key,
+        JSValue exportsObject,
+        bool hasEvaluated = false);
+
+    static JSCommonJSModule* create(
+        Zig::GlobalObject* globalObject,
+        const WTF::String& key,
+        ResolvedSource resolvedSource);
+
+    void toSyntheticSource(JSC::JSGlobalObject* globalObject, // appends this module's export names and values for the ES module namespace
+        JSC::Identifier moduleKey,
+        Vector<JSC::Identifier, 4>& exportNames,
+        JSC::MarkedArgumentBuffer& exportValues);
+
+    JSValue exportsObject(); // current value of the module's exports property
+    JSValue id();
+
+    DECLARE_VISIT_CHILDREN;
+
+    static bool put(JSC::JSCell* cell, JSC::JSGlobalObject* globalObject,
+        JSC::PropertyName propertyName, JSC::JSValue value,
+        JSC::PutPropertySlot& slot);
+
+    DECLARE_INFO;
+    template<typename, SubspaceAccess mode>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm);
+
+    bool hasEvaluated = false; // checked before evaluating so the module body runs at most once
+
+    JSCommonJSModule(JSC::VM& vm, JSC::Structure* structure)
+        : Base(vm, structure)
+    {
+    }
+};
+
+JSCommonJSModule* createCommonJSModuleWithoutRunning(
+    Zig::GlobalObject* globalObject,
+    Ref<Zig::SourceProvider> sourceProvider,
+    const WTF::String& sourceURL,
+    ResolvedSource source);
+
 JSC::Structure* createCommonJSModuleStructure(
     Zig::GlobalObject* globalObject);
 
-JSC::SourceCode createCommonJSModule(
+std::optional<JSC::SourceCode> createCommonJSModule(
     Zig::GlobalObject* globalObject,
     ResolvedSource source);
 
diff --git a/src/bun.js/bindings/FFI.zig b/src/bun.js/bindings/FFI.zig
index 087d8308c..fde4a8d30 100644
--- a/src/bun.js/bindings/FFI.zig
+++ b/src/bun.js/bindings/FFI.zig
@@ -42,7 +42,7 @@ pub inline fn JSVALUE_TO_UINT64(arg_value: EncodedJSValue) u64 {
         return @bitCast(u64, @as(c_longlong, JSVALUE_TO_INT32(value)));
     }
     if (JSVALUE_IS_NUMBER(value)) {
-        return @floatToInt(u64, JSVALUE_TO_DOUBLE(value));
+        return @intFromFloat(u64, JSVALUE_TO_DOUBLE(value));
     }
     return JSVALUE_TO_UINT64_SLOW(value);
 }
@@ -52,7 +52,7 @@ pub inline fn JSVALUE_TO_INT64(arg_value: EncodedJSValue) i64 {
         return @bitCast(i64, @as(c_longlong, JSVALUE_TO_INT32(value)));
     }
     if (JSVALUE_IS_NUMBER(value)) {
-        return @floatToInt(i64, JSVALUE_TO_DOUBLE(value));
+        return @intFromFloat(i64, JSVALUE_TO_DOUBLE(value));
     }
     return JSVALUE_TO_INT64_SLOW(value);
 }
@@ -67,7 +67,7 @@ pub inline fn UINT64_TO_JSVALUE(arg_globalObject: ?*anyopaque, arg_val: u64) Enc
         return INT32_TO_JSVALUE(@bitCast(i32, @truncate(c_uint, val)));
     }
     if (val < @bitCast(c_ulonglong, @as(c_longlong, @as(c_long, 9007199254740991)))) {
-        return DOUBLE_TO_JSVALUE(@intToFloat(f64, val));
+        return DOUBLE_TO_JSVALUE(@floatFromInt(f64, val));
     }
     return UINT64_TO_JSVALUE_SLOW(@ptrCast(*@import("./bindings.zig").JSGlobalObject, globalObject.?), val).asEncoded();
 }
@@ -78,7 +78,7 @@ pub inline fn INT64_TO_JSVALUE(arg_globalObject: ?*anyopaque, arg_val: i64) Enco
         return INT32_TO_JSVALUE(@bitCast(i32, @truncate(c_int, val)));
     }
     if ((val >= @bitCast(c_longlong, @as(c_longlong, -@as(c_long, 9007199254740991)))) and (val <= @bitCast(c_longlong, @as(c_longlong, @as(c_long, 9007199254740991))))) {
-        return DOUBLE_TO_JSVALUE(@intToFloat(f64, val));
+        return DOUBLE_TO_JSVALUE(@floatFromInt(f64, val));
     }
     return INT64_TO_JSVALUE_SLOW(@ptrCast(*@import("./bindings.zig").JSGlobalObject, globalObject.?), val).asEncoded();
 }
@@ -97,18 +97,18 @@ pub inline fn FLOAT_TO_JSVALUE(arg_val: f32) EncodedJSValue {
 pub inline fn BOOLEAN_TO_JSVALUE(arg_val: @"bool") EncodedJSValue {
     var val = arg_val;
     var res: EncodedJSValue = undefined;
-    res.asInt64 = @bitCast(i64, @as(c_longlong, if (@as(c_int, @boolToInt(val)) != 0) (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 1) else (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 0)));
+    res.asInt64 = @bitCast(i64, @as(c_longlong, if (@as(c_int, @intFromBool(val)) != 0) (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 1) else (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 0))); // encodes true as 7 ((2 | 4) | 1) and false as 6 ((2 | 4) | 0)
     return res;
 }
 pub inline fn PTR_TO_JSVALUE(arg_ptr: ?*anyopaque) EncodedJSValue {
     var ptr = arg_ptr;
     var val: EncodedJSValue = undefined;
-    val.asInt64 = @intCast(i64, @ptrToInt(ptr)) + (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49));
+    val.asInt64 = @intCast(i64, @intFromPtr(ptr)) + (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49)); // stores the pointer's address plus (1 << 49)
     return val;
 }
 pub inline fn JSVALUE_TO_PTR(arg_val: EncodedJSValue) ?*anyopaque {
     var val = arg_val;
-    return @intToPtr(?*anyopaque, val.asInt64 - (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49)));
+    return @ptrFromInt(?*anyopaque, val.asInt64 - (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49))); // inverse of PTR_TO_JSVALUE: subtracts (1 << 49) to recover the address
 }
 pub inline fn JSVALUE_TO_INT32(arg_val: EncodedJSValue) i32 {
     var val = arg_val;
diff --git a/src/bun.js/bindings/ImportMetaObject.cpp b/src/bun.js/bindings/ImportMetaObject.cpp
index a53712823..037305c81 100644
--- a/src/bun.js/bindings/ImportMetaObject.cpp
+++ b/src/bun.js/bindings/ImportMetaObject.cpp
@@ -38,6 +38,9 @@
 #include "JSDOMURL.h"
 #include "JavaScriptCore/JSNativeStdFunction.h"
 #include "JavaScriptCore/GetterSetter.h"
+#include <JavaScriptCore/LazyProperty.h>
+#include <JavaScriptCore/LazyPropertyInlines.h>
+#include <JavaScriptCore/VMTrapsInlines.h>
 
 namespace Zig {
 using namespace JSC;
@@ -56,6 +59,7 @@ static EncodedJSValue functionRequireResolve(JSC::JSGlobalObject* globalObject,
         return JSC::JSValue::encode(JSC::JSValue {});
     }
     default: {
+        JSValue thisValue = callFrame->thisValue();
         JSC::JSValue moduleName = callFrame->argument(0);
 
         auto doIt = [&](const WTF::String& fromStr) -> JSC::EncodedJSValue {
@@ -83,10 +87,12 @@ static EncodedJSValue functionRequireResolve(JSC::JSGlobalObject* globalObject,
             // require.resolve also supports a paths array
             // we only support a single path
             if (!fromValue.isUndefinedOrNull() && fromValue.isObject()) {
-                if (JSValue pathsValue = fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s))) {
-                    if (JSC::JSArray* array = JSC::jsDynamicCast<JSC::JSArray*>(pathsValue)) {
-                        if (array->length() > 0) {
-                            fromValue = array->getIndex(globalObject, 0);
+                if (auto pathsObject = fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s))) {
+                    if (pathsObject.isCell() && pathsObject.asCell()->type() == JSC::JSType::ArrayType) {
+                        auto pathsArray = JSC::jsCast<JSC::JSArray*>(pathsObject);
+                        if (pathsArray->length() > 0) {
+                            fromValue = pathsArray->getIndex(globalObject, 0);
+                            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
                         }
                     }
                 }
@@ -123,216 +129,265 @@ Zig::ImportMetaObject* Zig::ImportMetaObject::create(JSC::JSGlobalObject* global
 }
 
 JSC_DECLARE_HOST_FUNCTION(jsFunctionRequireResolve);
+JSC_DEFINE_HOST_FUNCTION(jsFunctionRequireResolve, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    JSValue thisValue = callFrame->thisValue();
+    WTF::String fromStr;
 
-class JSRequireResolveFunctionPrototype final : public JSC::InternalFunction {
-public:
-    using Base = JSC::InternalFunction;
-
-    static JSRequireResolveFunctionPrototype* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
-    {
-        auto* structure = createStructure(vm, globalObject, globalObject->functionPrototype());
-        JSRequireResolveFunctionPrototype* function = new (NotNull, JSC::allocateCell<JSRequireResolveFunctionPrototype>(vm)) JSRequireResolveFunctionPrototype(vm, structure);
-        function->finishCreation(vm);
-        return function;
+    if (thisValue.isString()) {
+        fromStr = thisValue.toWTFString(globalObject);
     }
 
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
-    {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::InternalFunctionType, StructureFlags), info());
-    }
+    return functionRequireResolve(globalObject, callFrame, fromStr);
+}
 
-    DECLARE_INFO;
+JSC_DEFINE_CUSTOM_GETTER(jsRequireCacheGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    Zig::GlobalObject* thisObject = jsCast<Zig::GlobalObject*>(globalObject); // the receiver (thisValue) is not consulted
+    return JSValue::encode(thisObject->lazyRequireCacheObject()); // the cache object comes from the global object
+}
+
+JSC_DEFINE_CUSTOM_SETTER(jsRequireCacheSetter,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSObject* thisObject = jsDynamicCast<JSObject*>(JSValue::decode(thisValue));
+    if (!thisObject)
+        return false; // non-object receiver: report the assignment as not performed
+    // Store the assigned value directly on the receiver under the same property name (attributes 0).
+    thisObject->putDirect(globalObject->vm(), propertyName, JSValue::decode(value), 0);
+    return true;
+}
 
-    static JSC::EncodedJSValue pathsFunction(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+JSC_DEFINE_HOST_FUNCTION(requireResolvePathsFunction, (JSGlobalObject * globalObject, CallFrame* callframe))
+{
+    return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr, 0));
+}
+
+static const HashTableValue RequireResolveFunctionPrototypeValues[] = {
+    { "paths"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, requireResolvePathsFunction, 1 } },
+};
+
+class RequireResolveFunctionPrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+    static RequireResolveFunctionPrototype* create(
+        JSC::JSGlobalObject* globalObject)
     {
-        return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr));
+        auto& vm = globalObject->vm();
+
+        auto* structure = RequireResolveFunctionPrototype::createStructure(vm, globalObject, globalObject->functionPrototype());
+        RequireResolveFunctionPrototype* prototype = new (NotNull, JSC::allocateCell<RequireResolveFunctionPrototype>(vm)) RequireResolveFunctionPrototype(vm, structure);
+        prototype->finishCreation(vm);
+        return prototype;
     }
 
-private:
-    JSRequireResolveFunctionPrototype(JSC::VM& vm, JSC::Structure* structure)
-        : JSC::InternalFunction(vm, structure, jsFunctionRequireResolve, jsFunctionRequireResolve)
+    DECLARE_INFO;
 
+    RequireResolveFunctionPrototype(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : Base(vm, structure)
     {
     }
 
-    void finishCreation(JSC::VM& vm)
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
-        this->putDirectNativeFunction(vm, globalObject(), Identifier::fromString(vm, "paths"_s), 0, pathsFunction, ImplementationVisibility::Public, NoIntrinsic, 0);
-        Base::finishCreation(vm, 2, "resolve"_s, PropertyAdditionMode::WithoutStructureTransition);
+        return &vm.plainObjectSpace();
     }
 };
 
-const JSC::ClassInfo JSRequireResolveFunctionPrototype::s_info = { "Function"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSRequireResolveFunctionPrototype) };
+class ResolveFunction final : public JSC::InternalFunction {
 
-class JSRequireResolveFunction final : public JSC::InternalFunction {
 public:
     using Base = JSC::InternalFunction;
-
-    static JSRequireResolveFunction* create(JSC::VM& vm, JSC::Structure* structure, const WTF::String& from)
-    {
-        JSRequireResolveFunction* function = new (NotNull, JSC::allocateCell<JSRequireResolveFunction>(vm)) JSRequireResolveFunction(vm, structure, from);
-        function->finishCreation(vm);
-        return function;
-    }
-
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+    static ResolveFunction* create(JSGlobalObject* globalObject)
     {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::InternalFunctionType, StructureFlags), info());
+        JSObject* resolvePrototype = RequireResolveFunctionPrototype::create(globalObject);
+        Structure* structure = Structure::create(
+            globalObject->vm(),
+            globalObject,
+            resolvePrototype,
+            JSC::TypeInfo(JSC::InternalFunctionType, StructureFlags),
+            ResolveFunction::info());
+        auto* resolveFunction = new (NotNull, JSC::allocateCell<ResolveFunction>(globalObject->vm())) ResolveFunction(globalObject->vm(), structure);
+        resolveFunction->finishCreation(globalObject->vm());
+        return resolveFunction;
     }
 
     DECLARE_INFO;
 
-    WTF::String from;
-
-    template<typename, JSC::SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
-    {
-        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
-            return nullptr;
-
-        return WebCore::subspaceForImpl<JSRequireResolveFunction, UseCustomHeapCellType::No>(
-            vm,
-            [](auto& spaces) { return spaces.m_clientSubspaceForRequireResolveFunction.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForRequireResolveFunction = std::forward<decltype(space)>(space); },
-            [](auto& spaces) { return spaces.m_subspaceForRequireResolveFunction.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_subspaceForRequireResolveFunction = std::forward<decltype(space)>(space); });
-    }
-
-private:
-    JSRequireResolveFunction(JSC::VM& vm, JSC::Structure* structure, const WTF::String& from_)
-        : JSC::InternalFunction(vm, structure, jsFunctionRequireResolve, jsFunctionRequireResolve)
-        , from(from_)
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
+        return &vm.internalFunctionSpace();
     }
 
-    void finishCreation(JSC::VM& vm)
+    ResolveFunction(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : InternalFunction(vm, structure, jsFunctionRequireResolve, nullptr)
     {
-        Base::finishCreation(vm);
     }
 };
 
-const JSC::ClassInfo JSRequireResolveFunction::s_info = { "Function"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSRequireResolveFunction) };
-
-JSC_DEFINE_HOST_FUNCTION(jsFunctionRequireResolve, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+JSObject* Zig::ImportMetaObject::createRequireResolveFunctionUnbound(VM& vm, JSGlobalObject* globalObject)
 {
-    JSRequireResolveFunction* thisObject = JSC::jsCast<JSRequireResolveFunction*>(callFrame->jsCallee());
-    return functionRequireResolve(globalObject, callFrame, thisObject->from);
+    return ResolveFunction::create(globalObject);
 }
 
-JSValue Zig::ImportMetaObject::createResolveFunctionPrototype(JSC::VM& vm, Zig::GlobalObject* globalObject)
+JSObject* Zig::ImportMetaObject::createRequireFunctionUnbound(VM& vm, JSGlobalObject* globalObject)
 {
-    return JSRequireResolveFunctionPrototype::create(vm, globalObject);
-}
+    auto& builtinNames = WebCore::builtinNames(vm);
 
-JSC::Structure* Zig::ImportMetaObject::createResolveFunctionStructure(JSC::VM& vm, Zig::GlobalObject* globalObject)
-{
-    JSValue prototype = globalObject->requireResolveFunctionPrototype();
-    return JSRequireResolveFunction::createStructure(vm, globalObject, prototype);
-}
+    JSC::JSFunction* requireDotMainFunction = JSFunction::create(
+        vm,
+        moduleMainCodeGenerator(vm),
+        globalObject->globalScope());
 
-JSC_DEFINE_CUSTOM_GETTER(jsRequireCacheGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
-{
-    Zig::GlobalObject* thisObject = jsCast<Zig::GlobalObject*>(globalObject);
-    return JSValue::encode(thisObject->lazyRequireCacheObject());
+    auto* prototype = JSC::constructEmptyObject(globalObject, globalObject->functionPrototype());
+    prototype->putDirect(
+        vm,
+        JSC::Identifier::fromString(vm, "main"_s),
+        JSC::GetterSetter::create(vm, globalObject, requireDotMainFunction, JSValue()),
+        PropertyAttribute::Builtin | PropertyAttribute::Accessor | PropertyAttribute::ReadOnly | 0);
+    prototype->putDirect(vm, JSC::Identifier::fromString(vm, "extensions"_s), constructEmptyObject(globalObject), 0);
+    prototype->putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "cache"_s), JSC::CustomGetterSetter::create(vm, Zig::jsRequireCacheGetter, Zig::jsRequireCacheSetter), 0);
+    return JSFunction::create(vm, importMetaObjectRequireCodeGenerator(vm), globalObject, JSFunction::createStructure(vm, globalObject, prototype));
 }
 
-JSC_DEFINE_CUSTOM_SETTER(jsRequireCacheSetter,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+JSObject* Zig::ImportMetaObject::createRequireFunction(VM& vm, JSGlobalObject* lexicalGlobalObject, const WTF::String& pathString)
 {
-    JSObject* thisObject = jsDynamicCast<JSObject*>(JSValue::decode(thisValue));
-    if (!thisObject)
-        return false;
+    auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto& builtinNames = WebCore::builtinNames(vm);
 
-    thisObject->putDirect(globalObject->vm(), propertyName, JSValue::decode(value), 0);
-    return true;
-}
+    JSObject* resolveFunctionUnbound = jsCast<JSObject*>(globalObject->importMetaRequireResolveFunctionUnbound()); // built as an InternalFunction, so it must not be cast to JSFunction
+    JSFunction* requireFunctionUnbound = jsCast<JSFunction*>(globalObject->importMetaRequireFunctionUnbound());
+    auto str = jsString(vm, pathString);
+    // Bind the path string as `this` of the shared, unbound require function...
+    JSFunction* requireFunction = JSC::JSBoundFunction::create(vm,
+        globalObject, requireFunctionUnbound,
+        str, ArgList(), 1, jsString(vm, String("require"_s)));
+    // ...and of the shared, unbound resolve function, which reads it back from callFrame->thisValue().
+    JSFunction* resolveFunction = JSC::JSBoundFunction::create(vm,
+        globalObject, resolveFunctionUnbound,
+        str, ArgList(), 2, jsString(vm, String("resolve"_s)));
+    // Expose the bound resolver as `require.resolve`.
+    requireFunction->putDirect(vm, builtinNames.resolvePublicName(), resolveFunction, PropertyAttribute::Function | 0);
 
-JSObject* Zig::ImportMetaObject::createRequireFunction(VM& vm, JSGlobalObject* lexicalGlobalObject, const WTF::String& pathString)
-{
-    Zig::GlobalObject* globalObject = static_cast<Zig::GlobalObject*>(lexicalGlobalObject);
-    JSFunction* requireFunction = JSFunction::create(vm, importMetaObjectRequireCodeGenerator(vm), globalObject);
-    auto* resolveFunction = JSRequireResolveFunction::create(vm, globalObject->requireResolveFunctionStructure(), pathString);
-    auto clientData = WebCore::clientData(vm);
-    requireFunction->putDirect(vm, clientData->builtinNames().pathPublicName(), jsString(vm, pathString), PropertyAttribute::DontEnum | 0);
-    requireFunction->putDirect(vm, clientData->builtinNames().resolvePublicName(), resolveFunction, PropertyAttribute::Function | PropertyAttribute::DontDelete | 0);
-    requireFunction->putDirectCustomAccessor(vm, Identifier::fromString(vm, "cache"_s), JSC::CustomGetterSetter::create(vm, jsRequireCacheGetter, jsRequireCacheSetter), 0);
     return requireFunction;
 }
 
+const JSC::ClassInfo RequireResolveFunctionPrototype::s_info = { "resolve"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(RequireResolveFunctionPrototype) };
+const JSC::ClassInfo ResolveFunction::s_info = { "resolve"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(ResolveFunction) };
+
 extern "C" EncodedJSValue functionImportMeta__resolveSync(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
 {
     JSC::VM& vm = globalObject->vm();
     auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
 
-    switch (callFrame->argumentCount()) {
-    case 0: {
+    JSValue thisValue = callFrame->thisValue(); // may be a path string (bound call) or an object exposing `path`
+    JSC::JSValue moduleName = callFrame->argument(0); // specifier to resolve
+    JSC::JSValue fromValue = callFrame->argument(1); // optional: string, { paths: [...] } object, or boolean ESM flag
 
-        // not "requires" because "require" could be confusing
-        JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync needs 1 argument (a string)"_s);
+    if (moduleName.isUndefinedOrNull()) {
+        JSC::throwTypeError(globalObject, scope, "expects a string"_s);
         scope.release();
         return JSC::JSValue::encode(JSC::JSValue {});
     }
-    default: {
-        JSC::JSValue moduleName = callFrame->argument(0);
 
-        if (moduleName.isUndefinedOrNull()) {
-            JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync expects a string"_s);
-            scope.release();
-            return JSC::JSValue::encode(JSC::JSValue {});
-        }
+    JSC__JSValue from = JSC::JSValue::encode(JSC::jsUndefined()); // defined fallback: not every branch below assigns it
+    bool isESM = true;
 
-        JSC__JSValue from;
-        bool isESM = true;
+    if (callFrame->argumentCount() > 1) {
 
-        if (callFrame->argumentCount() > 1) {
-            JSC::JSValue fromValue = callFrame->argument(1);
+        if (callFrame->argumentCount() > 2) {
+            JSC::JSValue isESMValue = callFrame->argument(2);
+            if (isESMValue.isBoolean()) { // non-boolean third arguments are ignored
+                isESM = isESMValue.toBoolean(globalObject);
+                RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+            }
+        }
 
-            // require.resolve also supports a paths array
-            // we only support a single path
-            if (!fromValue.isUndefinedOrNull() && fromValue.isObject()) {
-                if (JSC::JSArray* array = JSC::jsDynamicCast<JSC::JSArray*>(fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s)))) {
-                    if (array->length() > 0) {
-                        fromValue = array->getIndex(globalObject, 0);
-                    }
-                }
+        if (!fromValue.isUndefinedOrNull() && fromValue.isObject()) {
 
-                if (callFrame->argumentCount() > 2) {
-                    JSC::JSValue isESMValue = callFrame->argument(2);
-                    if (isESMValue.isBoolean()) {
-                        isESM = isESMValue.toBoolean(globalObject);
+            if (auto pathsObject = fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s))) {
+                if (pathsObject.isCell() && pathsObject.asCell()->type() == JSC::JSType::ArrayType) {
+                    auto pathsArray = JSC::jsCast<JSC::JSArray*>(pathsObject);
+                    if (pathsArray->length() > 0) {
+                        fromValue = pathsArray->getIndex(globalObject, 0); // only the first `paths` entry is used
                         RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
                     }
                 }
-            } else if (fromValue.isBoolean()) {
-                isESM = fromValue.toBoolean(globalObject);
-                RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
-            }
-            from = JSC::JSValue::encode(fromValue);
-
-        } else {
-            JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(callFrame->thisValue());
-            if (UNLIKELY(!thisObject)) {
-                auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-                JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync must be bound to an import.meta object"_s);
-                return JSC::JSValue::encode(JSC::JSValue {});
             }
 
-            auto clientData = WebCore::clientData(vm);
-
-            from = JSC::JSValue::encode(thisObject->get(globalObject, clientData->builtinNames().pathPublicName()));
+        } else if (fromValue.isBoolean()) {
+            isESM = fromValue.toBoolean(globalObject);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+            fromValue = JSC::jsUndefined(); // the boolean was the ESM flag, so it must not be used as the "from" value
         }
 
-        auto result = Bun__resolveSync(globalObject, JSC::JSValue::encode(moduleName), from, isESM);
+        if (fromValue.isString()) {
+            from = JSC::JSValue::encode(fromValue);
+        } else if (thisValue.isString()) {
+            from = JSC::JSValue::encode(thisValue);
+        }
 
-        if (!JSC::JSValue::decode(result).isString()) {
-            JSC::throwException(globalObject, scope, JSC::JSValue::decode(result));
+    } else if (thisValue.isString()) {
+        from = JSC::JSValue::encode(thisValue);
+    } else {
+        JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(thisValue);
+        if (UNLIKELY(!thisObject)) {
+            auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+            JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync must be bound to an import.meta object"_s);
             return JSC::JSValue::encode(JSC::JSValue {});
         }
 
+        auto clientData = WebCore::clientData(vm);
+        JSValue pathProperty = thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().pathPublicName());
+        RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+        if (pathProperty && pathProperty.isString())
+            from = JSC::JSValue::encode(pathProperty);
+    }
+    // At this point `from` is either a string chosen above or the `undefined` fallback.
+    auto result = Bun__resolveSync(globalObject, JSC::JSValue::encode(moduleName), from, isESM);
+
+    if (!JSC::JSValue::decode(result).isString()) { // a non-string result is thrown as the exception value
+        JSC::throwException(globalObject, scope, JSC::JSValue::decode(result));
+        return JSC::JSValue::encode(JSC::JSValue {});
+    }
+
+    scope.release();
+    return result;
+}
+
+extern "C" EncodedJSValue functionImportMeta__resolveSyncPrivate(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    JSC::VM& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+    // Arguments are read positionally: (moduleName, from, isESM); `this` is not consulted.
+    JSC::JSValue moduleName = callFrame->argument(0);
+    JSValue from = callFrame->argument(1); // forwarded to the resolver as-is, without a string check
+    bool isESM = callFrame->argument(2).asBoolean(); // NOTE(review): asBoolean() does not coerce — presumably callers always pass a real boolean; confirm
+
+    if (moduleName.isUndefinedOrNull()) {
+        JSC::throwTypeError(globalObject, scope, "expected module name as a string"_s);
         scope.release();
-        return result;
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
+
+    RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+
+    auto result = Bun__resolveSync(globalObject, JSC::JSValue::encode(moduleName), JSValue::encode(from), isESM);
+
+    if (!JSC::JSValue::decode(result).isString()) { // a non-string result is thrown as the exception value
+        JSC::throwException(globalObject, scope, JSC::JSValue::decode(result));
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
+
+    scope.release();
+    return result;
 }
 
 JSC_DECLARE_HOST_FUNCTION(functionImportMeta__resolve);
@@ -362,7 +417,7 @@ JSC_DEFINE_HOST_FUNCTION(functionImportMeta__resolve,
 
         JSC__JSValue from;
 
-        if (callFrame->argumentCount() > 1) {
+        if (callFrame->argumentCount() > 1 && callFrame->argument(1).isString()) {
             from = JSC::JSValue::encode(callFrame->argument(1));
         } else {
             JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(callFrame->thisValue());
@@ -374,7 +429,7 @@ JSC_DEFINE_HOST_FUNCTION(functionImportMeta__resolve,
 
             auto clientData = WebCore::clientData(vm);
 
-            from = JSC::JSValue::encode(thisObject->get(globalObject, clientData->builtinNames().pathPublicName()));
+            from = JSC::JSValue::encode(thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().pathPublicName()));
         }
 
         return Bun__resolve(globalObject, JSC::JSValue::encode(moduleName), from, true);
@@ -382,89 +437,244 @@ JSC_DEFINE_HOST_FUNCTION(functionImportMeta__resolve,
     }
 }
 
+enum class ImportMetaPropertyOffset : uint32_t {
+    url,
+    dir,
+    file,
+    path,
+    require,
+
+};
+static constexpr uint32_t numberOfImportMetaProperties = 5;
+
+Zig::ImportMetaObject* ImportMetaObject::create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, const WTF::String& url)
+{
+    ImportMetaObject* ptr = new (NotNull, JSC::allocateCell<ImportMetaObject>(vm)) ImportMetaObject(vm, structure, url);
+    ptr->finishCreation(vm);
+    return ptr;
+}
+Zig::ImportMetaObject* ImportMetaObject::create(JSC::JSGlobalObject* jslobalObject, JSC::JSString* keyString)
+{
+    auto* globalObject = jsCast<Zig::GlobalObject*>(jslobalObject);
+    auto& vm = globalObject->vm();
+    auto view = keyString->value(globalObject);
+    JSC::Structure* structure = globalObject->ImportMetaObjectStructure();
+    return Zig::ImportMetaObject::create(vm, globalObject, structure, view);
+}
+
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_url, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+    // Receiver is an ImportMetaObject: read `url` through its LazyProperty.
+    return JSValue::encode(thisObject->urlProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_dir, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+    // Receiver is an ImportMetaObject: read `dir` through its LazyProperty.
+    return JSValue::encode(thisObject->dirProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_file, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+    // Receiver is an ImportMetaObject: read `file` through its LazyProperty.
+    return JSValue::encode(thisObject->fileProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_path, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+    // Receiver is an ImportMetaObject: read `path` through its LazyProperty.
+    return JSValue::encode(thisObject->pathProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_require, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+    // Receiver is an ImportMetaObject: read `require` through its LazyProperty.
+    return JSValue::encode(thisObject->requireProperty.getInitializedOnMainThread(thisObject));
+}
+
+static const HashTableValue ImportMetaObjectPrototypeValues[] = {
+    { "resolve"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, functionImportMeta__resolve, 0 } },
+    { "resolveSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, functionImportMeta__resolveSync, 0 } },
+    { "url"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_url, 0 } },
+    { "dir"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_dir, 0 } },
+    { "file"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_file, 0 } },
+    { "path"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_path, 0 } },
+    { "require"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_require, 0 } },
+};
+
 class ImportMetaObjectPrototype final : public JSC::JSNonFinalObject {
 public:
+    DECLARE_INFO;
     using Base = JSC::JSNonFinalObject;
 
-    static ImportMetaObjectPrototype* create(JSC::VM& vm, JSGlobalObject* globalObject, JSC::Structure* structure)
+    static Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
     {
-        ImportMetaObjectPrototype* ptr = new (NotNull, JSC::allocateCell<ImportMetaObjectPrototype>(vm)) ImportMetaObjectPrototype(vm, globalObject, structure);
-        ptr->finishCreation(vm, globalObject);
-        return ptr;
+        return Structure::create(vm, globalObject, globalObject->objectPrototype(), TypeInfo(ObjectType, StructureFlags), info());
+    }
+
+    static ImportMetaObjectPrototype* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+    {
+        ImportMetaObjectPrototype* prototype = new (NotNull, JSC::allocateCell<ImportMetaObjectPrototype>(vm)) ImportMetaObjectPrototype(vm, structure);
+        prototype->finishCreation(vm, globalObject);
+        return prototype;
     }
 
-    DECLARE_INFO;
     template<typename CellType, JSC::SubspaceAccess>
     static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
         return &vm.plainObjectSpace();
     }
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+
+    void finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
     {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
+        Base::finishCreation(vm);
+
+        auto* clientData = WebCore::clientData(vm);
+        auto& builtinNames = clientData->builtinNames();
+
+        reifyStaticProperties(vm, ImportMetaObject::info(), ImportMetaObjectPrototypeValues, *this);
+        JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+
+        this->putDirect(
+            vm,
+            builtinNames.mainPublicName(),
+            GetterSetter::create(vm, globalObject, JSFunction::create(vm, importMetaObjectMainCodeGenerator(vm), globalObject), nullptr),
+            JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Accessor | JSC::PropertyAttribute::Builtin | 0);
     }
 
-private:
-    ImportMetaObjectPrototype(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+    ImportMetaObjectPrototype(JSC::VM& vm, JSC::Structure* structure)
         : Base(vm, structure)
     {
     }
+};
+
+const ClassInfo ImportMetaObjectPrototype::s_info = {
+    "ImportMeta"_s,
 
-    void finishCreation(JSC::VM&, JSC::JSGlobalObject*);
+    Base::info(), nullptr, nullptr, CREATE_METHOD_TABLE(ImportMetaObjectPrototype)
 };
-STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(ImportMetaObjectPrototype, ImportMetaObjectPrototype::Base);
 
-JSObject* ImportMetaObject::createPrototype(VM& vm, JSDOMGlobalObject& globalObject)
+JSC::Structure* ImportMetaObject::createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
 {
-    return ImportMetaObjectPrototype::create(vm, &globalObject, ImportMetaObjectPrototype::createStructure(vm, &globalObject, globalObject.objectPrototype()));
-}
+    ImportMetaObjectPrototype* prototype = ImportMetaObjectPrototype::create(vm,
+        globalObject,
+        ImportMetaObjectPrototype::createStructure(vm, globalObject));
 
-void ImportMetaObjectPrototype::finishCreation(VM& vm, JSGlobalObject* globalObject_)
-{
-    Base::finishCreation(vm);
-    auto* globalObject = reinterpret_cast<Zig::GlobalObject*>(globalObject_);
     auto clientData = WebCore::clientData(vm);
-
     auto& builtinNames = clientData->builtinNames();
 
-    this->putDirect(vm, builtinNames.filePublicName(), jsEmptyString(vm), 0);
-    this->putDirect(vm, builtinNames.dirPublicName(), jsEmptyString(vm), 0);
-    this->putDirect(vm, builtinNames.pathPublicName(), jsEmptyString(vm), 0);
-    this->putDirect(vm, builtinNames.urlPublicName(), jsEmptyString(vm), 0);
-
-    this->putDirect(
-        vm,
-        builtinNames.mainPublicName(),
-        GetterSetter::create(vm, globalObject, JSFunction::create(vm, importMetaObjectMainCodeGenerator(vm), globalObject), nullptr),
-        JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Accessor | JSC::PropertyAttribute::Builtin | 0);
-
-    this->putDirect(vm, Identifier::fromString(vm, "primordials"_s), jsUndefined(), JSC::PropertyAttribute::DontEnum | 0);
-
-    String requireString = "[[require]]"_s;
-    this->putDirect(vm, builtinNames.requirePublicName(), Zig::ImportMetaObject::createRequireFunction(vm, globalObject, requireString), PropertyAttribute::Builtin | PropertyAttribute::Function | 0);
-
-    this->putDirectNativeFunction(vm, globalObject, builtinNames.resolvePublicName(), 1,
-        functionImportMeta__resolve,
-        ImplementationVisibility::Public,
-        NoIntrinsic,
-        JSC::PropertyAttribute::Function | 0);
-    this->putDirectNativeFunction(
-        vm, globalObject, builtinNames.resolveSyncPublicName(),
-        1,
-        functionImportMeta__resolveSync,
-        ImplementationVisibility::Public,
-        NoIntrinsic,
-        JSC::PropertyAttribute::Function | 0);
-
-    JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+    return Structure::create(vm, globalObject, prototype, TypeInfo(ObjectType, StructureFlags), ImportMetaObject::info());
 }
 
 void ImportMetaObject::finishCreation(VM& vm)
 {
     Base::finishCreation(vm);
     ASSERT(inherits(info()));
+    // Every field below is registered with initLater(); each initializer derives its value from `url`.
+    this->requireProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSFunction>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+        // Hold the path in an owning String: fileSystemPath() returns a temporary, so a StringView of it would dangle.
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        WTF::String path;
+        if (url.protocolIs("file"_s)) {
+            path = url.fileSystemPath();
+        } else {
+            path = url.path().toString();
+        }
+
+        JSFunction* value = jsCast<JSFunction*>(ImportMetaObject::createRequireFunction(init.vm, meta->globalObject(), path));
+        init.set(value);
+    });
+    this->urlProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+
+        init.set(jsString(init.vm, url.string()));
+    });
+    this->dirProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        WTF::String dirname; // owning copy; see the lifetime note in the require initializer's fix
+
+        if (url.protocolIs("file"_s)) {
+            dirname = url.fileSystemPath();
+        } else {
+            dirname = url.path().toString();
+        }
+        // Drop a trailing slash, otherwise cut at the last '/' to leave the containing directory.
+        if (dirname.endsWith("/"_s)) {
+            dirname = dirname.substring(0, dirname.length() - 1);
+        } else if (dirname.contains('/')) {
+            dirname = dirname.substring(0, dirname.reverseFind('/'));
+        }
+
+        init.set(jsString(init.vm, dirname));
+    });
+    this->fileProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        WTF::String path; // owning copy: a StringView of fileSystemPath()'s temporary would dangle
+        if (url.protocolIs("file"_s)) {
+            path = url.fileSystemPath();
+        } else {
+            path = url.path().toString();
+        }
+
+        WTF::String filename;
+        // Take everything after the last '/', ignoring one trailing slash if present.
+        if (path.endsWith("/"_s)) {
+            filename = path.substring(path.reverseFind('/', path.length() - 2) + 1);
+        } else {
+            filename = path.substring(path.reverseFind('/') + 1);
+        }
+
+        init.set(jsString(init.vm, filename));
+    });
+    this->pathProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        WTF::String path; // owning copy: a StringView of fileSystemPath()'s temporary would dangle
+
+        if (url.protocolIs("file"_s)) {
+            path = url.fileSystemPath();
+        } else {
+            path = url.path().toString();
+        }
+
+        init.set(jsString(init.vm, path));
+    });
+}
+
+template<typename Visitor>
+void ImportMetaObject::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{
+    ImportMetaObject* fn = jsCast<ImportMetaObject*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(fn, info());
+    Base::visitChildren(fn, visitor);
+    // Hand each of the five lazy properties to the visitor, after the base class has visited its own fields.
+    fn->requireProperty.visit(visitor);
+    fn->urlProperty.visit(visitor);
+    fn->dirProperty.visit(visitor);
+    fn->fileProperty.visit(visitor);
+    fn->pathProperty.visit(visitor);
+}
 
+DEFINE_VISIT_CHILDREN(ImportMetaObject);
+
 void ImportMetaObject::analyzeHeap(JSCell* cell, HeapAnalyzer& analyzer)
 {
     auto* thisObject = jsCast<ImportMetaObject*>(cell);
@@ -475,9 +685,6 @@ void ImportMetaObject::analyzeHeap(JSCell* cell, HeapAnalyzer& analyzer)
     Base::analyzeHeap(cell, analyzer);
 }
 
-const JSC::ClassInfo ImportMetaObjectPrototype::s_info = { "ImportMeta"_s, &Base::s_info, nullptr, nullptr,
-    CREATE_METHOD_TABLE(ImportMetaObjectPrototype) };
-
 const JSC::ClassInfo ImportMetaObject::s_info = { "ImportMeta"_s, &Base::s_info, nullptr, nullptr,
     CREATE_METHOD_TABLE(ImportMetaObject) };
 }
diff --git a/src/bun.js/bindings/ImportMetaObject.h b/src/bun.js/bindings/ImportMetaObject.h
index d0f8f0963..6b5661039 100644
--- a/src/bun.js/bindings/ImportMetaObject.h
+++ b/src/bun.js/bindings/ImportMetaObject.h
@@ -9,6 +9,7 @@
 #include "JSDOMWrapperCache.h"
 
 extern "C" JSC_DECLARE_HOST_FUNCTION(functionImportMeta__resolveSync);
+extern "C" JSC_DECLARE_HOST_FUNCTION(functionImportMeta__resolveSyncPrivate);
 extern "C" JSC::EncodedJSValue Bun__resolve(JSC::JSGlobalObject* global, JSC::EncodedJSValue specifier, JSC::EncodedJSValue from, bool is_esm);
 extern "C" JSC::EncodedJSValue Bun__resolveSync(JSC::JSGlobalObject* global, JSC::EncodedJSValue specifier, JSC::EncodedJSValue from, bool is_esm);
 extern "C" JSC::EncodedJSValue Bun__resolveSyncWithSource(JSC::JSGlobalObject* global, JSC::EncodedJSValue specifier, BunString* from, bool is_esm);
@@ -18,97 +19,56 @@ namespace Zig {
 using namespace JSC;
 using namespace WebCore;
 
-class ImportMetaObject final : public JSC::JSDestructibleObject {
+JSC_DECLARE_CUSTOM_GETTER(jsRequireCacheGetter);
+JSC_DECLARE_CUSTOM_SETTER(jsRequireCacheSetter);
+
+class ImportMetaObject final : public JSC::JSNonFinalObject {
 public:
-    using Base = JSC::JSDestructibleObject;
+    using Base = JSC::JSNonFinalObject;
 
-    static ImportMetaObject* create(JSC::VM& vm, JSGlobalObject* globalObject, JSC::Structure* structure)
-    {
-        ImportMetaObject* ptr = new (NotNull, JSC::allocateCell<ImportMetaObject>(vm)) ImportMetaObject(vm, globalObject, structure);
-        ptr->finishCreation(vm);
-        return ptr;
-    }
+    static ImportMetaObject* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, const WTF::String& url);
 
-    static JSC::Structure* createResolveFunctionStructure(JSC::VM& vm, Zig::GlobalObject* globalObject);
-    static JSValue createResolveFunctionPrototype(JSC::VM& vm, Zig::GlobalObject* globalObject);
+    static JSC::JSObject* createRequireFunctionUnbound(JSC::VM& vm, JSGlobalObject* globalObject);
+    static JSC::JSObject* createRequireResolveFunctionUnbound(JSC::VM& vm, JSGlobalObject* globalObject);
     static JSObject* createRequireFunction(VM& vm, JSGlobalObject* lexicalGlobalObject, const WTF::String& pathString);
 
-    static ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSC::JSValue key);
-
-    static inline Zig::ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSC::JSString* keyString)
-    {
-        // TODO: optimize this by reusing the same JSC::Structure object and using putDirectOffset
-        auto& vm = globalObject->vm();
-        auto view = keyString->value(globalObject);
-        JSC::Structure* structure = WebCore::getDOMStructure<Zig::ImportMetaObject>(vm, *reinterpret_cast<Zig::GlobalObject*>(globalObject));
-        Zig::ImportMetaObject* metaProperties = Zig::ImportMetaObject::create(vm, globalObject, structure);
-        if (UNLIKELY(!metaProperties)) {
-            return nullptr;
-        }
-
-        auto clientData = WebCore::clientData(vm);
-        auto& builtinNames = clientData->builtinNames();
-
-        auto index = view.reverseFind('/', view.length());
-        if (index != WTF::notFound) {
-            metaProperties->putDirect(vm, builtinNames.dirPublicName(),
-                JSC::jsSubstring(globalObject, keyString, 0, index));
-            metaProperties->putDirect(
-                vm, builtinNames.filePublicName(),
-                JSC::jsSubstring(globalObject, keyString, index + 1, view.length() - index - 1));
-        } else {
-            metaProperties->putDirect(vm, builtinNames.filePublicName(), keyString);
-        }
-        metaProperties->putDirect(
-            vm,
-            builtinNames.pathPublicName(),
-            keyString,
-            0);
-
-        metaProperties->putDirect(
-            vm,
-            builtinNames.requirePublicName(),
-            Zig::ImportMetaObject::createRequireFunction(vm, globalObject, view),
-            PropertyAttribute::Builtin | PropertyAttribute::Function | 0);
-
-        if (view.startsWith('/')) {
-            metaProperties->putDirect(vm, builtinNames.urlPublicName(), JSC::JSValue(JSC::jsString(vm, WTF::URL::fileURLWithFileSystemPath(view).string())));
-        } else {
-            if (view.startsWith("node:"_s) || view.startsWith("bun:"_s)) {
-                metaProperties->putDirect(globalObject->vm(), JSC::Identifier::fromString(globalObject->vm(), "primordials"_s), reinterpret_cast<Zig::GlobalObject*>(globalObject)->primordialsObject());
-            }
-            metaProperties->putDirect(vm, builtinNames.urlPublicName(), keyString);
-        }
-
-        return metaProperties;
-    }
+    static ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSC::JSString* keyString);
+    static ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSValue keyString);
 
     DECLARE_INFO;
+    DECLARE_VISIT_CHILDREN;
 
-    static constexpr bool needsDestruction = true;
-
-    template<typename CellType, SubspaceAccess>
-    static CompleteSubspace* subspaceFor(VM& vm)
+    template<typename, JSC::SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
-        return &vm.destructibleObjectSpace();
-    }
+        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+            return nullptr;
 
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
-    {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
+        return WebCore::subspaceForImpl<ImportMetaObject, UseCustomHeapCellType::No>(
+            vm,
+            [](auto& spaces) { return spaces.m_clientSubspaceForImportMeta.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForImportMeta = std::forward<decltype(space)>(space); },
+            [](auto& spaces) { return spaces.m_subspaceForImportMeta.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_subspaceForImportMeta = std::forward<decltype(space)>(space); });
     }
 
-    static JSObject* createPrototype(VM& vm, JSDOMGlobalObject& globalObject);
+    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject);
     static void analyzeHeap(JSCell*, JSC::HeapAnalyzer&);
 
+    WTF::String url;
+    LazyProperty<JSObject, JSFunction> requireProperty;
+    LazyProperty<JSObject, JSString> dirProperty;
+    LazyProperty<JSObject, JSString> urlProperty;
+    LazyProperty<JSObject, JSString> fileProperty;
+    LazyProperty<JSObject, JSString> pathProperty;
+
 private:
-    ImportMetaObject(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+    ImportMetaObject(JSC::VM& vm, JSC::Structure* structure, const WTF::String& url)
         : Base(vm, structure)
+        , url(url)
     {
     }
 
     void finishCreation(JSC::VM&);
 };
-STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(ImportMetaObject, ImportMetaObject::Base);
 
 }
 \ No newline at end of file
diff --git a/src/bun.js/bindings/JSBuffer.cpp b/src/bun.js/bindings/JSBuffer.cpp
index 00965da89..e420e24ef 100644
--- a/src/bun.js/bindings/JSBuffer.cpp
+++ b/src/bun.js/bindings/JSBuffer.cpp
@@ -1436,43 +1436,71 @@ static inline JSC::EncodedJSValue jsBufferPrototypeFunction_toStringBody(JSC::JS
     if (length == 0)
         return JSC::JSValue::encode(JSC::jsEmptyString(vm));
 
-    switch (callFrame->argumentCount()) {
-    case 0: {
-        break;
-    }
-    case 2:
-    case 3:
-    case 1: {
-        EnsureStillAliveScope arg1 = callFrame->uncheckedArgument(0);
-        if (!arg1.value().isUndefined()) {
-            encoding = parseEncoding(lexicalGlobalObject, scope, arg1.value());
+    size_t argsCount = callFrame->argumentCount();
+
+    JSC::JSValue arg1 = callFrame->argument(0);
+    JSC::JSValue arg2 = callFrame->argument(1);
+    JSC::JSValue arg3 = callFrame->argument(2);
+
+    // This method could be called in following forms:
+    // - toString()
+    // - toString(encoding)
+    // - toString(encoding, start)
+    // - toString(encoding, start, end)
+    // - toString(offset, length)
+    // - toString(offset, length, encoding)
+    if (argsCount == 0)
+        return jsBufferToString(vm, lexicalGlobalObject, castedThis, offset, length, encoding);
+
+    if (arg1.isString()) {
+        encoding = parseEncoding(lexicalGlobalObject, scope, arg1);
+        RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+
+        if (!arg3.isUndefined()) {
+            // length is end
+            length = std::min(byteLength, static_cast<uint32_t>(arg3.toInt32(lexicalGlobalObject)));
             RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
         }
-        if (callFrame->argumentCount() == 1)
-            break;
-    }
-    // any
-    case 5: {
-        JSC::JSValue arg2 = callFrame->uncheckedArgument(1);
-        int32_t ioffset = arg2.toInt32(lexicalGlobalObject);
+
+        int32_t istart = 0;
+
+        if (!arg2.isUndefined()) {
+            istart = arg2.toInt32(lexicalGlobalObject);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+        }
+
+        if (istart < 0) {
+            throwTypeError(lexicalGlobalObject, scope, "Start must be a positive integer"_s);
+            return JSC::JSValue::encode(jsUndefined());
+        }
+        offset = static_cast<uint32_t>(istart);
+        length = (length > offset) ? (length - offset) : 0;
+    } else {
+
+        int32_t ioffset = 0;
+
+        if (!arg1.isUndefined()) {
+            ioffset = arg1.toInt32(lexicalGlobalObject);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+        }
+
         if (ioffset < 0) {
             throwTypeError(lexicalGlobalObject, scope, "Offset must be a positive integer"_s);
             return JSC::JSValue::encode(jsUndefined());
         }
+
         offset = static_cast<uint32_t>(ioffset);
+        length = (length > offset) ? (length - offset) : 0;
 
-        if (callFrame->argumentCount() == 2)
-            break;
-    }
+        if (!arg3.isUndefined()) {
+            encoding = parseEncoding(lexicalGlobalObject, scope, arg3);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+        }
 
-    default: {
-        length = std::min(byteLength, static_cast<uint32_t>(callFrame->argument(2).toInt32(lexicalGlobalObject)));
-        break;
-    }
+        if (!arg2.isUndefined())
+            length = std::min(length, static_cast<uint32_t>(arg2.toInt32(lexicalGlobalObject)));
     }
 
-    length -= std::min(offset, length);
-
     return jsBufferToString(vm, lexicalGlobalObject, castedThis, offset, length, encoding);
 }
 
@@ -1662,14 +1690,6 @@ JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_compare, (JSGlobalObject *
     return jsBufferConstructorFunction_compareBody(lexicalGlobalObject, callFrame);
 }
 
-JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_isBuffer, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
-{
-    if (callFrame->argumentCount() < 1)
-        return JSC::JSValue::encode(JSC::jsBoolean(false));
-
-    return JSC::JSValue::encode(JSC::jsBoolean(JSBuffer__isBuffer(lexicalGlobalObject, JSC::JSValue::encode(callFrame->uncheckedArgument(0)))));
-}
-
 JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_concat, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     return jsBufferConstructorFunction_concatBody(lexicalGlobalObject, callFrame);
@@ -1678,24 +1698,6 @@ JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_concat, (JSGlobalObject * l
 extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorAllocWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int size));
 extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorAllocUnsafeWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int size));
 extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorAllocUnsafeSlowWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int size));
-extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorIsBufferWithoutTypeChecks, JSValue, (JSC::JSGlobalObject * lexicalGlobalObject, void*, JSUint8Array* value));
-
-static bool isBufferWithCell(JSC::JSGlobalObject* lexicalGlobalObject, JSC::JSUint8Array* cell)
-{
-    auto& vm = lexicalGlobalObject->vm();
-    JSValue prototype = cell->getPrototype(vm, lexicalGlobalObject);
-    return prototype.inherits<JSBufferPrototype>();
-}
-
-JSC_DEFINE_JIT_OPERATION(jsBufferConstructorIsBufferWithoutTypeChecks, JSValue, (JSC::JSGlobalObject * lexicalGlobalObject, void* ctx, JSUint8Array* thisValue))
-{
-    VM& vm = JSC::getVM(lexicalGlobalObject);
-    IGNORE_WARNINGS_BEGIN("frame-address")
-    CallFrame* callFrame = DECLARE_CALL_FRAME(vm);
-    IGNORE_WARNINGS_END
-    JSC::JITOperationPrologueCallFrameTracer tracer(vm, callFrame);
-    return jsBoolean(isBufferWithCell(lexicalGlobalObject, thisValue));
-}
 
 JSC_DEFINE_JIT_OPERATION(jsBufferConstructorAllocWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int byteLength))
 {
@@ -1849,11 +1851,17 @@ static const HashTableValue JSBufferPrototypeTableValues[]
           { "readUInt8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt8CodeGenerator, 1 } },
           { "readUIntBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntBECodeGenerator, 1 } },
           { "readUIntLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntLECodeGenerator, 1 } },
+          // name alias
+          { "readUintBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntBECodeGenerator, 1 } },
+          { "readUintLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntLECodeGenerator, 1 } },
+          { "readUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt8CodeGenerator, 1 } },
           { "readUint16BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt16BECodeGenerator, 1 } },
           { "readUint16LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt16LECodeGenerator, 1 } },
           { "readUint32BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt32BECodeGenerator, 1 } },
           { "readUint32LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt32LECodeGenerator, 1 } },
-          { "readUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt8CodeGenerator, 1 } },
+          { "readBigUint64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadBigUInt64BECodeGenerator, 1 } },
+          { "readBigUint64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadBigUInt64LECodeGenerator, 1 } },
+
           { "slice"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeSliceCodeGenerator, 2 } },
           { "subarray"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeSliceCodeGenerator, 2 } },
           { "swap16"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferPrototypeFunction_swap16, 0 } },
@@ -1873,8 +1881,6 @@ static const HashTableValue JSBufferPrototypeTableValues[]
           { "writeBigInt64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigInt64LECodeGenerator, 1 } },
           { "writeBigUInt64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64BECodeGenerator, 1 } },
           { "writeBigUInt64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64LECodeGenerator, 1 } },
-          { "writeBigUint64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64BECodeGenerator, 1 } },
-          { "writeBigUint64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64LECodeGenerator, 1 } },
           { "writeDouble"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteDoubleLECodeGenerator, 1 } },
           { "writeDoubleBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteDoubleBECodeGenerator, 1 } },
           { "writeDoubleLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteDoubleLECodeGenerator, 1 } },
@@ -1897,13 +1903,18 @@ static const HashTableValue JSBufferPrototypeTableValues[]
           { "writeUInt8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt8CodeGenerator, 1 } },
           { "writeUIntBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntBECodeGenerator, 1 } },
           { "writeUIntLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntLECodeGenerator, 1 } },
+          // name alias
+          { "writeUintBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntBECodeGenerator, 1 } },
+          { "writeUintLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntLECodeGenerator, 1 } },
+          { "writeUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt8CodeGenerator, 1 } },
           { "writeUint16"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt16LECodeGenerator, 1 } },
           { "writeUint16BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt16BECodeGenerator, 1 } },
           { "writeUint16LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt16LECodeGenerator, 1 } },
           { "writeUint32"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt32LECodeGenerator, 1 } },
           { "writeUint32BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt32BECodeGenerator, 1 } },
           { "writeUint32LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt32LECodeGenerator, 1 } },
-          { "writeUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt8CodeGenerator, 1 } },
+          { "writeBigUint64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64BECodeGenerator, 1 } },
+          { "writeBigUint64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64LECodeGenerator, 1 } },
       };
 
 void JSBufferPrototype::finishCreation(VM& vm, JSC::JSGlobalObject* globalThis)
@@ -1928,11 +1939,6 @@ static const JSC::DOMJIT::Signature DOMJITSignaturejsBufferConstructorAlloc(jsBu
     JSC::DOMJIT::Effect::forWriteKinds(JSC::DFG::AbstractHeapKind::Heap),
     JSC::SpecUint8Array, JSC::SpecInt32Only);
 
-static const JSC::DOMJIT::Signature DOMJITSignaturejsBufferConstructorIsBuffer(jsBufferConstructorIsBufferWithoutTypeChecks,
-    JSBufferConstructor::info(),
-    JSC::DOMJIT::Effect::forPure(),
-    JSC::SpecOther, JSC::SpecUint8Array);
-
 static const JSC::DOMJIT::Signature DOMJITSignaturejsBufferConstructorAllocUnsafe(jsBufferConstructorAllocUnsafeWithoutTypeChecks,
     JSBufferConstructor::info(),
     JSC::DOMJIT::Effect::forWriteKinds(JSC::DFG::AbstractHeapKind::Heap),
@@ -1954,7 +1960,7 @@ static const HashTableValue JSBufferConstructorTableValues[] = {
     { "compare"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_compare, 2 } },
     { "concat"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_concat, 2 } },
     { "from"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferConstructorFromCodeGenerator, 1 } },
-    { "isBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction), NoIntrinsic, { HashTableValue::DOMJITFunctionType, jsBufferConstructorFunction_isBuffer, &DOMJITSignaturejsBufferConstructorIsBuffer } },
+    { "isBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferConstructorIsBufferCodeGenerator, 1 } },
     { "toBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_toBuffer, 1 } },
     { "isEncoding"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_isEncoding, 1 } },
 };
diff --git a/src/bun.js/bindings/JSBundlerPlugin.cpp b/src/bun.js/bindings/JSBundlerPlugin.cpp
index cae6a4b22..ec3933574 100644
--- a/src/bun.js/bindings/JSBundlerPlugin.cpp
+++ b/src/bun.js/bindings/JSBundlerPlugin.cpp
@@ -54,7 +54,7 @@ void BundlerPlugin::NamespaceList::append(JSC::VM& vm, JSC::RegExp* filter, Stri
     nsGroup->append(WTFMove(regex));
 }
 
-bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespaceStr, const ZigString* path, bool isOnLoad)
+bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const BunString* namespaceStr, const BunString* path, bool isOnLoad)
 {
     constexpr bool usesPatternContextBuffer = false;
     if (isOnLoad) {
@@ -62,7 +62,7 @@ bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespac
             return false;
 
         // Avoid unnecessary string copies
-        auto namespaceString = namespaceStr ? Zig::toString(*namespaceStr) : String();
+        auto namespaceString = namespaceStr ? Bun::toWTFString(*namespaceStr) : String();
 
         auto* group = this->onLoad.group(namespaceString);
         if (group == nullptr) {
@@ -70,7 +70,7 @@ bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespac
         }
 
         auto& filters = *group;
-        auto pathString = Zig::toString(*path);
+        auto pathString = Bun::toWTFString(*path);
 
         for (auto& filter : filters) {
             Yarr::MatchingContextHolder regExpContext(vm, usesPatternContextBuffer, nullptr, Yarr::MatchFrom::CompilerThread);
@@ -84,14 +84,14 @@ bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespac
             return false;
 
         // Avoid unnecessary string copies
-        auto namespaceString = namespaceStr ? Zig::toString(*namespaceStr) : String();
+        auto namespaceString = namespaceStr ? Bun::toWTFString(*namespaceStr) : String();
 
         auto* group = this->onResolve.group(namespaceString);
         if (group == nullptr) {
             return false;
         }
 
-        auto pathString = Zig::toString(*path);
+        auto pathString = Bun::toWTFString(*path);
         auto& filters = *group;
 
         for (auto& filter : filters) {
@@ -115,9 +115,19 @@ static const HashTableValue JSBundlerPluginHashTable[] = {
 class JSBundlerPlugin final : public JSC::JSNonFinalObject {
 public:
     using Base = JSC::JSNonFinalObject;
-    static JSBundlerPlugin* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, void* config, BunPluginTarget target)
+    static JSBundlerPlugin* create(JSC::VM& vm,
+        JSC::JSGlobalObject* globalObject,
+        JSC::Structure* structure,
+        void* config,
+        BunPluginTarget target,
+        JSBundlerPluginAddErrorCallback addError = JSBundlerPlugin__addError,
+        JSBundlerPluginOnLoadAsyncCallback onLoadAsync = JSBundlerPlugin__onLoadAsync,
+        JSBundlerPluginOnResolveAsyncCallback onResolveAsync = JSBundlerPlugin__onResolveAsync)
     {
-        JSBundlerPlugin* ptr = new (NotNull, JSC::allocateCell<JSBundlerPlugin>(vm)) JSBundlerPlugin(vm, globalObject, structure, config, target);
+        JSBundlerPlugin* ptr = new (NotNull, JSC::allocateCell<JSBundlerPlugin>(vm)) JSBundlerPlugin(vm, globalObject, structure, config, target,
+            addError,
+            onLoadAsync,
+            onResolveAsync);
         ptr->finishCreation(vm);
         return ptr;
     }
@@ -147,9 +157,10 @@ public:
     JSC::LazyProperty<JSBundlerPlugin, JSC::JSFunction> setupFunction;
 
 private:
-    JSBundlerPlugin(JSC::VM& vm, JSC::JSGlobalObject*, JSC::Structure* structure, void* config, BunPluginTarget target)
+    JSBundlerPlugin(JSC::VM& vm, JSC::JSGlobalObject*, JSC::Structure* structure, void* config, BunPluginTarget target,
+        JSBundlerPluginAddErrorCallback addError, JSBundlerPluginOnLoadAsyncCallback onLoadAsync, JSBundlerPluginOnResolveAsyncCallback onResolveAsync)
         : JSC::JSNonFinalObject(vm, structure)
-        , plugin(BundlerPlugin(config, target))
+        , plugin(BundlerPlugin(config, target, addError, onLoadAsync, onResolveAsync))
     {
     }
 
@@ -199,7 +210,7 @@ JSC_DEFINE_HOST_FUNCTION(jsBundlerPluginFunction_addError, (JSC::JSGlobalObject
 {
     JSBundlerPlugin* thisObject = jsCast<JSBundlerPlugin*>(callFrame->thisValue());
     if (!thisObject->plugin.tombstoned) {
-        JSBundlerPlugin__addError(
+        thisObject->plugin.addError(
             UNWRAP_BUNDLER_PLUGIN(callFrame),
             thisObject->plugin.config,
             JSValue::encode(callFrame->argument(1)),
@@ -212,7 +223,7 @@ JSC_DEFINE_HOST_FUNCTION(jsBundlerPluginFunction_onLoadAsync, (JSC::JSGlobalObje
 {
     JSBundlerPlugin* thisObject = jsCast<JSBundlerPlugin*>(callFrame->thisValue());
     if (!thisObject->plugin.tombstoned) {
-        JSBundlerPlugin__onLoadAsync(
+        thisObject->plugin.onLoadAsync(
             UNWRAP_BUNDLER_PLUGIN(callFrame),
             thisObject->plugin.config,
             JSValue::encode(callFrame->argument(1)),
@@ -225,7 +236,7 @@ JSC_DEFINE_HOST_FUNCTION(jsBundlerPluginFunction_onResolveAsync, (JSC::JSGlobalO
 {
     JSBundlerPlugin* thisObject = jsCast<JSBundlerPlugin*>(callFrame->thisValue());
     if (!thisObject->plugin.tombstoned) {
-        JSBundlerPlugin__onResolveAsync(
+        thisObject->plugin.onResolveAsync(
             UNWRAP_BUNDLER_PLUGIN(callFrame),
             thisObject->plugin.config,
             JSValue::encode(callFrame->argument(1)),
@@ -274,15 +285,15 @@ void JSBundlerPlugin::finishCreation(JSC::VM& vm)
     reifyStaticProperties(vm, JSBundlerPlugin::info(), JSBundlerPluginHashTable, *this);
 }
 
-extern "C" bool JSBundlerPlugin__anyMatches(Bun::JSBundlerPlugin* pluginObject, const ZigString* namespaceString, const ZigString* path, bool isOnLoad)
+extern "C" bool JSBundlerPlugin__anyMatches(Bun::JSBundlerPlugin* pluginObject, const BunString* namespaceString, const BunString* path, bool isOnLoad)
 {
     return pluginObject->plugin.anyMatchesCrossThread(pluginObject->vm(), namespaceString, path, isOnLoad);
 }
 
-extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const ZigString* namespaceString, const ZigString* path, void* context, uint8_t defaultLoaderId)
+extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const BunString* namespaceString, const BunString* path, void* context, uint8_t defaultLoaderId)
 {
-    WTF::String namespaceStringStr = namespaceString ? Zig::toStringCopy(*namespaceString) : WTF::String();
-    WTF::String pathStr = path ? Zig::toStringCopy(*path) : WTF::String();
+    WTF::String namespaceStringStr = namespaceString ? Bun::toWTFString(*namespaceString) : WTF::String();
+    WTF::String pathStr = path ? Bun::toWTFString(*path) : WTF::String();
 
     JSFunction* function = plugin->onLoadFunction.get(plugin);
     if (UNLIKELY(!function))
@@ -306,7 +317,7 @@ extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject,
         auto exception = scope.exception();
         scope.clearException();
         if (!plugin->plugin.tombstoned) {
-            JSBundlerPlugin__addError(
+            plugin->plugin.addError(
                 context,
                 plugin->plugin.config,
                 JSC::JSValue::encode(exception),
@@ -315,14 +326,14 @@ extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject,
     }
 }
 
-extern "C" void JSBundlerPlugin__matchOnResolve(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const ZigString* namespaceString, const ZigString* path, const ZigString* importer, void* context, uint8_t kindId)
+extern "C" void JSBundlerPlugin__matchOnResolve(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const BunString* namespaceString, const BunString* path, const BunString* importer, void* context, uint8_t kindId)
 {
-    WTF::String namespaceStringStr = namespaceString ? Zig::toStringCopy(*namespaceString) : WTF::String("file"_s);
+    WTF::String namespaceStringStr = namespaceString ? Bun::toWTFString(*namespaceString) : WTF::String("file"_s);
     if (namespaceStringStr.length() == 0) {
         namespaceStringStr = WTF::String("file"_s);
     }
-    WTF::String pathStr = path ? Zig::toStringCopy(*path) : WTF::String();
-    WTF::String importerStr = importer ? Zig::toStringCopy(*importer) : WTF::String();
+    WTF::String pathStr = path ? Bun::toWTFString(*path) : WTF::String();
+    WTF::String importerStr = importer ? Bun::toWTFString(*importer) : WTF::String();
     auto& vm = globalObject->vm();
 
     JSFunction* function = plugin->onResolveFunction.get(plugin);
diff --git a/src/bun.js/bindings/JSBundlerPlugin.h b/src/bun.js/bindings/JSBundlerPlugin.h
index 08aa1d140..4d82cdc1b 100644
--- a/src/bun.js/bindings/JSBundlerPlugin.h
+++ b/src/bun.js/bindings/JSBundlerPlugin.h
@@ -9,6 +9,10 @@
 #include <JavaScriptCore/Yarr.h>
 #include <JavaScriptCore/Strong.h>
 
+typedef void (*JSBundlerPluginAddErrorCallback)(void*, void*, JSC::EncodedJSValue, JSC::EncodedJSValue);
+typedef void (*JSBundlerPluginOnLoadAsyncCallback)(void*, void*, JSC::EncodedJSValue, JSC::EncodedJSValue);
+typedef void (*JSBundlerPluginOnResolveAsyncCallback)(void*, void*, JSC::EncodedJSValue, JSC::EncodedJSValue, JSC::EncodedJSValue);
+
 namespace Bun {
 
 using namespace JSC;
@@ -42,10 +46,13 @@ public:
     };
 
 public:
-    bool anyMatchesCrossThread(JSC::VM&, const ZigString* namespaceStr, const ZigString* path, bool isOnLoad);
+    bool anyMatchesCrossThread(JSC::VM&, const BunString* namespaceStr, const BunString* path, bool isOnLoad);
     void tombstone() { tombstoned = true; }
 
-    BundlerPlugin(void* config, BunPluginTarget target)
+    BundlerPlugin(void* config, BunPluginTarget target, JSBundlerPluginAddErrorCallback addError, JSBundlerPluginOnLoadAsyncCallback onLoadAsync, JSBundlerPluginOnResolveAsyncCallback onResolveAsync)
+        : addError(addError)
+        , onLoadAsync(onLoadAsync)
+        , onResolveAsync(onResolveAsync)
     {
         this->target = target;
         this->config = config;
@@ -54,6 +61,10 @@ public:
     NamespaceList onLoad = {};
     NamespaceList onResolve = {};
     BunPluginTarget target { BunPluginTargetBrowser };
+
+    JSBundlerPluginAddErrorCallback addError;
+    JSBundlerPluginOnLoadAsyncCallback onLoadAsync;
+    JSBundlerPluginOnResolveAsyncCallback onResolveAsync;
     void* config { nullptr };
     bool tombstoned { false };
 };
diff --git a/src/bun.js/bindings/JSEnvironmentVariableMap.cpp b/src/bun.js/bindings/JSEnvironmentVariableMap.cpp
index 5c0357066..4989f7e96 100644
--- a/src/bun.js/bindings/JSEnvironmentVariableMap.cpp
+++ b/src/bun.js/bindings/JSEnvironmentVariableMap.cpp
@@ -30,7 +30,7 @@ JSC_DEFINE_CUSTOM_GETTER(jsGetterEnvironmentVariable, (JSGlobalObject * globalOb
     if (UNLIKELY(name.len == 0))
         return JSValue::encode(jsUndefined());
 
-    if (!Bun__getEnvValue(globalObject, &name, &value) || value.len == 0) {
+    if (!Bun__getEnvValue(globalObject, &name, &value)) {
         return JSValue::encode(jsUndefined());
     }
 
@@ -144,4 +144,4 @@ JSValue createEnvironmentVariablesMap(Zig::GlobalObject* globalObject)
 
     return object;
 }
-}
-\ No newline at end of file
+}
diff --git a/src/bun.js/bindings/JSMockFunction.cpp b/src/bun.js/bindings/JSMockFunction.cpp
index b7c2659b4..3a84f0139 100644
--- a/src/bun.js/bindings/JSMockFunction.cpp
+++ b/src/bun.js/bindings/JSMockFunction.cpp
@@ -19,6 +19,7 @@
 #include <JavaScriptCore/WeakMapImpl.h>
 #include <JavaScriptCore/WeakMapImplInlines.h>
 #include <JavaScriptCore/FunctionPrototype.h>
+#include <JavaScriptCore/DateInstance.h>
 
 namespace Bun {
 
@@ -65,6 +66,41 @@ JSC_DECLARE_HOST_FUNCTION(jsMockFunctionMockRejectedValueOnce);
 JSC_DECLARE_HOST_FUNCTION(jsMockFunctionWithImplementationCleanup);
 JSC_DECLARE_HOST_FUNCTION(jsMockFunctionWithImplementation);
 
+// No-op stub: returns `this` unchanged and touches no timer state. It exists so that the same test code can also run in Jest.
+extern "C" EncodedJSValue JSMock__jsUseFakeTimers(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    return JSValue::encode(callFrame->thisValue()); // hand the receiver back as-is
+}
+
+extern "C" EncodedJSValue JSMock__jsUseRealTimers(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{ // Resets the global's overridenDateNow field and returns `this`.
+    globalObject->overridenDateNow = -1; // NOTE(review): -1 presumably means "no override, use the real clock" — confirm against jsDateNow()
+    return JSValue::encode(callFrame->thisValue());
+}
+
+extern "C" EncodedJSValue JSMock__jsNow(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{ // Returns globalObject->jsDateNow() wrapped as a JS number; the call frame is not consulted.
+    return JSValue::encode(jsNumber(globalObject->jsDateNow()));
+}
+extern "C" EncodedJSValue JSMock__jsSetSystemTime(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{ // Sets or clears the global's overridenDateNow from the first argument; always returns `this`.
+    JSValue argument0 = callFrame->argument(0);
+    // Date argument: adopt its internal number when std::isnormal accepts it, otherwise leave the override untouched.
+    if (auto* dateInstance = jsDynamicCast<DateInstance*>(argument0)) {
+        if (std::isnormal(dateInstance->internalNumber())) {
+            globalObject->overridenDateNow = dateInstance->internalNumber();
+        }
+        return JSValue::encode(callFrame->thisValue());
+    }
+    // Positive number: use it as the override. Anything else stores -1, the same value JSMock__jsUseRealTimers stores.
+    if (argument0.isNumber() && argument0.asNumber() > 0) {
+        globalObject->overridenDateNow = argument0.asNumber();
+    } else { // the reset must not run after a successful assignment, or the value just stored would be discarded
+        globalObject->overridenDateNow = -1;
+    }
+    return JSValue::encode(callFrame->thisValue());
+}
+
 uint64_t JSMockModule::s_nextInvocationId = 0;
 
 // This is taken from JSWeakSet
@@ -391,6 +427,7 @@ void JSMockFunction::visitChildrenImpl(JSCell* cell, Visitor& visitor)
     visitor.append(fn->instances);
     visitor.append(fn->returnValues);
     visitor.append(fn->invocationCallOrder);
+    visitor.append(fn->spyOriginal);
     fn->mock.visit(visitor);
 }
 DEFINE_VISIT_CHILDREN(JSMockFunction);
@@ -526,13 +563,13 @@ extern "C" void JSMock__resetSpies(Zig::GlobalObject* globalObject)
     globalObject->mockModule.activeSpies.clear();
 }
 
-extern "C" EncodedJSValue jsFunctionResetSpies(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callframe)
+extern "C" EncodedJSValue JSMock__jsRestoreAllMocks(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callframe)
 {
     JSMock__resetSpies(jsCast<Zig::GlobalObject*>(globalObject));
     return JSValue::encode(jsUndefined());
 }
 
-extern "C" EncodedJSValue JSMock__spyOn(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callframe)
+extern "C" EncodedJSValue JSMock__jsSpyOn(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callframe)
 {
     auto& vm = lexicalGlobalObject->vm();
     auto scope = DECLARE_THROW_SCOPE(vm);
@@ -568,15 +605,19 @@ extern "C" EncodedJSValue JSMock__spyOn(JSC::JSGlobalObject* lexicalGlobalObject
 
     // easymode: regular property or missing property
     if (!hasValue || slot.isValue()) {
+        JSValue value = jsUndefined();
+        if (hasValue) {
+            value = slot.getValue(globalObject, propertyKey);
+            if (jsDynamicCast<JSMockFunction*>(value)) {
+                return JSValue::encode(value);
+            }
+        }
+
         auto* mock = JSMockFunction::create(vm, globalObject, globalObject->mockModule.mockFunctionStructure.getInitializedOnMainThread(globalObject), CallbackKind::GetterSetter);
         mock->spyTarget = JSC::Weak<JSObject>(object, &weakValueHandleOwner(), nullptr);
         mock->spyIdentifier = propertyKey.isSymbol() ? Identifier::fromUid(vm, propertyKey.uid()) : Identifier::fromString(vm, propertyKey.publicName());
         mock->spyAttributes = hasValue ? slot.attributes() : 0;
         unsigned attributes = 0;
-        JSValue value = jsUndefined();
-
-        if (hasValue)
-            value = slot.getValue(globalObject, propertyKey);
 
         if (hasValue && ((slot.attributes() & PropertyAttribute::Function) != 0 || (value.isCell() && value.isCallable()))) {
             if (hasValue)
@@ -963,7 +1004,7 @@ JSC_DEFINE_CUSTOM_GETTER(jsMockFunctionGetter_protoImpl, (JSC::JSGlobalObject *
     return JSValue::encode(jsUndefined());
 }
 
-JSC_DEFINE_HOST_FUNCTION(jsMockFunctionConstructor, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callframe))
+extern "C" EncodedJSValue JSMock__jsMockFn(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callframe)
 {
     auto& vm = lexicalGlobalObject->vm();
     auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
@@ -997,11 +1038,6 @@ JSC_DEFINE_HOST_FUNCTION(jsMockFunctionConstructor, (JSC::JSGlobalObject * lexic
     return JSValue::encode(thisObject);
 }
 
-extern "C" EncodedJSValue JSMockFunction__createObject(Zig::GlobalObject* globalObject)
-{
-    auto& vm = globalObject->vm();
-    return JSValue::encode(JSC::JSFunction::create(vm, globalObject, 0, "mock"_s, jsMockFunctionConstructor, ImplementationVisibility::Public));
-}
 extern "C" EncodedJSValue JSMockFunction__getCalls(EncodedJSValue encodedValue)
 {
     JSValue value = JSValue::decode(encodedValue);
diff --git a/src/bun.js/bindings/JSReadableHelper.h b/src/bun.js/bindings/JSReadableHelper.h
index 6746bcbec..3e2554c2b 100644
--- a/src/bun.js/bindings/JSReadableHelper.h
+++ b/src/bun.js/bindings/JSReadableHelper.h
@@ -8,7 +8,6 @@ JSC_DECLARE_HOST_FUNCTION(jsReadable_maybeReadMore);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_resume);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_emitReadable);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_onEofChunk);
-JSC_DECLARE_HOST_FUNCTION(jsReadable_resume_);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_emitReadable_);
 
 } // namespace WebCore
diff --git a/src/bun.js/bindings/JSReadableState.cpp b/src/bun.js/bindings/JSReadableState.cpp
index d09e30d44..1f3a36def 100644
--- a/src/bun.js/bindings/JSReadableState.cpp
+++ b/src/bun.js/bindings/JSReadableState.cpp
@@ -26,10 +26,10 @@ int64_t getHighWaterMark(JSC::VM& vm, JSC::JSGlobalObject* globalObject, bool is
     auto* clientData = WebCore::clientData(vm);
     if (JSValue highWaterMarkVal = options->getIfPropertyExists(globalObject, clientData->builtinNames().highWaterMarkPublicName())) {
         if (isDuplex && (highWaterMarkVal.isUndefined() || highWaterMarkVal.isNull())) {
-            highWaterMarkVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
+            highWaterMarkVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
         }
 
-        if (!highWaterMarkVal.isUndefinedOrNull()) {
+        if (highWaterMarkVal && highWaterMarkVal.isNumber()) {
             return highWaterMarkVal.toInt32(globalObject);
         }
     }
@@ -42,9 +42,9 @@ void JSReadableState::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj
     Base::finishCreation(vm);
 
     if (options != nullptr) {
-        JSC::JSValue objectModeVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "objectMode"_s));
+        JSC::JSValue objectModeVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "objectMode"_s));
         if (isDuplex && !objectModeVal) {
-            objectModeVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
+            objectModeVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
         }
         if (objectModeVal && objectModeVal.toBoolean(globalObject))
             setBool(JSReadableState::Mask::objectMode, true);
@@ -65,13 +65,16 @@ void JSReadableState::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj
     m_pipes.set(vm, this, JSC::constructEmptyArray(globalObject, nullptr, 0));
 
     if (options != nullptr) {
-        JSC::JSValue emitCloseVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "emitClose"_s));
-        if (!emitCloseVal.isBoolean() || emitCloseVal.toBoolean(globalObject))
+        JSC::JSValue emitCloseVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "emitClose"_s));
+        if (!emitCloseVal || emitCloseVal.toBoolean(globalObject))
             setBool(JSReadableState::Mask::emitClose, true);
         // Has it been destroyed.
-        JSC::JSValue autoDestroyVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "autoDestroy"_s));
-        if (!autoDestroyVal.isBoolean() || autoDestroyVal.toBoolean(globalObject))
+        JSC::JSValue autoDestroyVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "autoDestroy"_s));
+        if (!autoDestroyVal || autoDestroyVal.toBoolean(globalObject))
             setBool(JSReadableState::Mask::autoDestroy, true);
+    } else {
+        setBool(JSReadableState::Mask::emitClose, true);
+        setBool(JSReadableState::Mask::autoDestroy, true);
     }
 
     // Indicates whether the stream has finished destroying.
@@ -90,26 +93,25 @@ void JSReadableState::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj
     }
 
     m_awaitDrainWriters.set(vm, this, JSC::jsNull());
+    JSValue decodeValue = JSC::jsNull();
+    JSValue encodingValue = JSC::jsNull();
 
-    if (options == nullptr) {
-        m_decoder.set(vm, this, JSC::jsNull());
-        m_encoding.set(vm, this, JSC::jsNull());
-    } else {
-        JSC::JSValue encodingVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "encoding"_s));
+    if (options != nullptr) {
+        JSC::JSValue encodingVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "encoding"_s));
         if (encodingVal && encodingVal.isString()) {
             auto constructor = reinterpret_cast<Zig::GlobalObject*>(globalObject)->JSStringDecoder();
             auto constructData = JSC::getConstructData(constructor);
             MarkedArgumentBuffer args;
             args.append(encodingVal);
             JSObject* decoder = JSC::construct(globalObject, constructor, constructData, args);
-            m_decoder.set(vm, this, decoder);
-            m_encoding.set(vm, this, encodingVal);
-        } else {
-            m_decoder.set(vm, this, JSC::jsNull());
-            m_encoding.set(vm, this, JSC::jsNull());
+            decodeValue = decoder;
+            encodingValue = encodingVal;
         }
     }
 
+    m_decoder.set(vm, this, decodeValue);
+    m_encoding.set(vm, this, encodingValue);
+
     // ReadableState.constructed is set to false during construction when a _construct method is implemented
     // this is here so that the ReadableState behavior tracks the behavior in node, and that calling Readable.read
     // will work when we return early from construct because there is no Readable._construct implemented
@@ -403,10 +405,12 @@ JSC::EncodedJSValue JSReadableStateConstructor::construct(JSC::JSGlobalObject* l
         return JSValue::encode(jsUndefined());
     }
     isDuplex = isDuplexVal.toBoolean(lexicalGlobalObject);
+    RETURN_IF_EXCEPTION(throwScope, encodedJSValue());
     JSObject* options = nullptr;
-    if (optionsVal.toBoolean(lexicalGlobalObject) && optionsVal.isObject()) {
+    if (optionsVal && optionsVal.isObject()) {
         options = optionsVal.toObject(lexicalGlobalObject);
     }
+    RETURN_IF_EXCEPTION(throwScope, encodedJSValue());
 
     JSReadableState* stringDecoder = JSReadableState::create(
         vm, lexicalGlobalObject, reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject)->JSReadableStateStructure(), isDuplex, options);
diff --git a/src/bun.js/bindings/JSSink.cpp b/src/bun.js/bindings/JSSink.cpp
index 36be334dd..ed2554dc7 100644
--- a/src/bun.js/bindings/JSSink.cpp
+++ b/src/bun.js/bindings/JSSink.cpp
@@ -1,6 +1,6 @@
 
 // AUTO-GENERATED FILE. DO NOT EDIT.
-// Generated by 'make generate-sink' at 2023-05-18T01:04:00.447Z
+// Generated by 'make generate-sink' at 2023-07-06T14:22:07.346Z
 // To regenerate this file, run:
 //
 //   make generate-sink
diff --git a/src/bun.js/bindings/JSSink.h b/src/bun.js/bindings/JSSink.h
index 5bbfab777..386554ebb 100644
--- a/src/bun.js/bindings/JSSink.h
+++ b/src/bun.js/bindings/JSSink.h
@@ -1,6 +1,6 @@
 
 // AUTO-GENERATED FILE. DO NOT EDIT.
-// Generated by 'make generate-sink' at 2023-05-18T01:04:00.446Z
+// Generated by 'make generate-sink' at 2023-07-06T14:22:07.345Z
 //
 #pragma once
 
diff --git a/src/bun.js/bindings/JSSinkLookupTable.h b/src/bun.js/bindings/JSSinkLookupTable.h
index a4ace6dc3..e4ed81629 100644
--- a/src/bun.js/bindings/JSSinkLookupTable.h
+++ b/src/bun.js/bindings/JSSinkLookupTable.h
@@ -1,4 +1,4 @@
-// Automatically generated from src/bun.js/bindings/JSSink.cpp using /Users/jarred/Code/bun/src/bun.js/WebKit/Source/JavaScriptCore/create_hash_table. DO NOT EDIT!
+// Automatically generated from src/bun.js/bindings/JSSink.cpp using /home/cirospaciari/Repos/bun/src/bun.js/WebKit/Source/JavaScriptCore/create_hash_table. DO NOT EDIT!
 
 
 
diff --git a/src/bun.js/bindings/JSStringDecoder.cpp b/src/bun.js/bindings/JSStringDecoder.cpp
index 5ec258522..b8c2dd50c 100644
--- a/src/bun.js/bindings/JSStringDecoder.cpp
+++ b/src/bun.js/bindings/JSStringDecoder.cpp
@@ -129,7 +129,7 @@ uint8_t JSStringDecoder::utf8CheckIncomplete(uint8_t* bufPtr, uint32_t length, u
             m_lastNeed = nb - 1;
         return nb;
     }
-    if (--j < i || nb == -2)
+    if (j == 0 || --j < i || nb == -2)
         return 0;
     nb = utf8CheckByte(bufPtr[j]);
     if (nb >= 0) {
@@ -137,7 +137,7 @@ uint8_t JSStringDecoder::utf8CheckIncomplete(uint8_t* bufPtr, uint32_t length, u
             m_lastNeed = nb - 2;
         return nb;
     }
-    if (--j < i || nb == -2)
+    if (j == 0 || --j < i || nb == -2)
         return 0;
     nb = utf8CheckByte(bufPtr[j]);
     if (nb >= 0) {
diff --git a/src/bun.js/bindings/ModuleLoader.cpp b/src/bun.js/bindings/ModuleLoader.cpp
index ed1e5702b..0ccbb7dbb 100644
--- a/src/bun.js/bindings/ModuleLoader.cpp
+++ b/src/bun.js/bindings/ModuleLoader.cpp
@@ -36,6 +36,11 @@
 #include "../modules/TTYModule.h"
 #include "node_util_types.h"
 #include "CommonJSModuleRecord.h"
+#include <JavaScriptCore/JSModuleLoader.h>
+#include <JavaScriptCore/Completion.h>
+#include <JavaScriptCore/JSModuleNamespaceObject.h>
+#include <JavaScriptCore/JSMap.h>
+#include <JavaScriptCore/JSMapInlines.h>
 
 namespace Bun {
 using namespace Zig;
@@ -350,6 +355,110 @@ extern "C" void Bun__onFulfillAsyncModule(
     promise->resolve(promise->globalObject(), JSC::JSSourceCode::create(vm, JSC::SourceCode(provider)));
 }
 
+// Loads `specifier` for the CommonJS module object `target`. Returns `target` once it has been evaluated (a builtin
+// handled by the switch below, or a transpiled file whose result has a non-zero commonJSExportsLen), jsNumber(-1)
+// when nothing was evaluated into `target` here, and an empty JSValue once an exception is pending on the throw scope.
+JSValue fetchCommonJSModule(
+    Zig::GlobalObject* globalObject,
+    JSCommonJSModule* target,
+    JSValue specifierValue,
+    BunString* specifier,
+    BunString* referrer)
+{
+    void* bunVM = globalObject->bunVM();
+    auto& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    ErrorableResolvedSource resValue;
+    ErrorableResolvedSource* res = &resValue;
+    auto& builtinNames = WebCore::clientData(vm)->builtinNames();
+    // When Bun__fetchBuiltinModule reports it handled the specifier: throw a reported failure, otherwise dispatch on the tag.
+    if (Bun__fetchBuiltinModule(bunVM, globalObject, specifier, referrer, res)) {
+        if (!res->success) {
+            throwException(scope, res->result.err, globalObject);
+            return JSValue();
+        }
+
+        switch (res->result.value.tag) {
+        case SyntheticModuleType::Module: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateNodeModuleModule);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::Buffer: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateBufferSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::TTY: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateTTYSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::NodeUtilTypes: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), Bun::generateNodeUtilTypesSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::Process: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateProcessSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::Events: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateEventsSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::StringDecoder: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateStringDecoderSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        default: {
+            RELEASE_AND_RETURN(scope, jsNumber(-1));
+        }
+        }
+    }
+
+    // if (JSC::JSValue virtualModuleResult = JSValue::decode(Bun__runVirtualModule(globalObject, specifier))) {
+    //     return handleVirtualModuleResult<allowPromise>(globalObject, virtualModuleResult, res, specifier, referrer);
+    // }
+    auto* loader = globalObject->moduleLoader();
+    JSMap* registry = jsCast<JSMap*>(loader->getDirect(vm, Identifier::fromString(vm, "registry"_s)));
+
+    auto hasAlreadyLoadedESMVersionSoWeShouldntTranspileItTwice = [&]() -> bool {
+        JSValue entry = registry->get(globalObject, specifierValue);
+        if (!entry || !entry.isObject()) {
+            return false;
+        }
+
+        // `state` may be missing or not an int32 on an unexpected entry; asInt32() is only valid on int32 values.
+        JSValue state = entry.getObject()->getDirect(vm, builtinNames.statePublicName());
+        return state && state.isInt32() && state.asInt32() > JSModuleLoader::Status::Fetch;
+    };
+
+    if (hasAlreadyLoadedESMVersionSoWeShouldntTranspileItTwice()) {
+        RELEASE_AND_RETURN(scope, jsNumber(-1));
+    }
+    // No registry entry past the Fetch status: transpile the file; the outcome (or the error) is written into `res`.
+    Bun__transpileFile(bunVM, globalObject, specifier, referrer, res, false);
+    if (res->success && res->result.value.commonJSExportsLen) {
+        target->evaluate(globalObject, Bun::toWTFString(*specifier).isolatedCopy(), res->result.value);
+        RETURN_IF_EXCEPTION(scope, {});
+        RELEASE_AND_RETURN(scope, target);
+    }
+
+    if (!res->success) {
+        throwException(scope, res->result.err, globalObject);
+        RELEASE_AND_RETURN(scope, {});
+    }
+    // Otherwise hand the transpiled source to the module loader's provideFetch for `specifierValue` and return -1.
+    auto&& provider = Zig::SourceProvider::create(globalObject, res->result.value);
+    globalObject->moduleLoader()->provideFetch(globalObject, specifierValue, JSC::SourceCode(provider));
+    RETURN_IF_EXCEPTION(scope, {});
+    RELEASE_AND_RETURN(scope, jsNumber(-1));
+}
+
 template<bool allowPromise>
 static JSValue fetchSourceCode(
     Zig::GlobalObject* globalObject,
@@ -382,6 +491,11 @@ static JSValue fetchSourceCode(
 
     auto rejectOrResolve = [&](JSValue code) -> JSValue {
         if (auto* exception = scope.exception()) {
+            if constexpr (!allowPromise) {
+                scope.release();
+                return {};
+            }
+
             scope.clearException();
             return rejectedInternalPromise(globalObject, exception);
         }
@@ -457,7 +571,7 @@ static JSValue fetchSourceCode(
             return rejectOrResolve(JSSourceCode::create(vm, WTFMove(source)));
         }
         default: {
-            auto&& provider = Zig::SourceProvider::create(globalObject, res->result.value);
+            auto&& provider = Zig::SourceProvider::create(globalObject, res->result.value, JSC::SourceProviderSourceType::Module, true);
             return rejectOrResolve(JSC::JSSourceCode::create(vm, JSC::SourceCode(provider)));
         }
         }
@@ -477,8 +591,19 @@ static JSValue fetchSourceCode(
     }
 
     if (res->success && res->result.value.commonJSExportsLen) {
-        auto source = Bun::createCommonJSModule(globalObject, res->result.value);
-        return rejectOrResolve(JSSourceCode::create(vm, WTFMove(source)));
+        auto created = Bun::createCommonJSModule(globalObject, res->result.value);
+
+        if (created.has_value()) {
+            return rejectOrResolve(JSSourceCode::create(vm, WTFMove(created.value())));
+        }
+
+        if constexpr (allowPromise) {
+            auto* exception = scope.exception();
+            scope.clearException();
+            return rejectedInternalPromise(globalObject, exception);
+        } else {
+            return JSC::jsUndefined();
+        }
     }
 
     if (!res->success) {
diff --git a/src/bun.js/bindings/ModuleLoader.h b/src/bun.js/bindings/ModuleLoader.h
index 0deaeff08..6eb04bf40 100644
--- a/src/bun.js/bindings/ModuleLoader.h
+++ b/src/bun.js/bindings/ModuleLoader.h
@@ -15,6 +15,8 @@ class JSInternalPromise;
 namespace Bun {
 using namespace JSC;
 
+class JSCommonJSModule;
+
 typedef uint8_t OnLoadResultType;
 const OnLoadResultType OnLoadResultTypeError = 0;
 const OnLoadResultType OnLoadResultTypeCode = 1;
@@ -91,4 +93,11 @@ JSValue fetchSourceCodeAsync(
     BunString* specifier,
     BunString* referrer);
 
+JSValue fetchCommonJSModule(
+    Zig::GlobalObject* globalObject,
+    JSCommonJSModule* moduleObject,
+    JSValue specifierValue,
+    BunString* specifier,
+    BunString* referrer);
+
 } // namespace Bun
 \ No newline at end of file
diff --git a/src/bun.js/bindings/Process.cpp b/src/bun.js/bindings/Process.cpp
index 69ee11e60..745be0e47 100644
--- a/src/bun.js/bindings/Process.cpp
+++ b/src/bun.js/bindings/Process.cpp
@@ -10,13 +10,41 @@
 #include "ImportMetaObject.h"
 #include <sys/stat.h>
 #include "ZigConsoleClient.h"
+#include <JavaScriptCore/GetterSetter.h>
+#include <JavaScriptCore/JSSet.h>
+#include <JavaScriptCore/LazyProperty.h>
+#include <JavaScriptCore/LazyPropertyInlines.h>
+#include <JavaScriptCore/VMTrapsInlines.h>
+
 #pragma mark - Node.js Process
 
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__linux__)
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#endif
+
+#if !defined(_MSC_VER)
+#include <unistd.h> // setuid, getuid
+#endif
+
 namespace Zig {
 
 using namespace JSC;
 
 #define REPORTED_NODE_VERSION "18.15.0"
+#define processObjectBindingCodeGenerator processObjectInternalsBindingCodeGenerator
+#define processObjectMainModuleCodeGenerator moduleMainCodeGenerator
+
+#if !defined(BUN_WEBKIT_VERSION)
+#define BUN_WEBKIT_VERSION "unknown"
+#endif
 
 using JSGlobalObject = JSC::JSGlobalObject;
 using Exception = JSC::Exception;
@@ -30,117 +58,42 @@ using JSObject = JSC::JSObject;
 using JSNonFinalObject = JSC::JSNonFinalObject;
 namespace JSCastingHelpers = JSC::JSCastingHelpers;
 
-static JSC_DECLARE_CUSTOM_SETTER(Process_setTitle);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getArgv);
-static JSC_DECLARE_CUSTOM_SETTER(Process_setArgv);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getTitle);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getVersionsLazy);
-static JSC_DECLARE_CUSTOM_SETTER(Process_setVersionsLazy);
-
-static JSC_DECLARE_CUSTOM_GETTER(Process_getPID);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getPPID);
-
-static JSC_DECLARE_HOST_FUNCTION(Process_functionCwd);
+JSC_DECLARE_CUSTOM_SETTER(Process_setTitle);
+JSC_DECLARE_CUSTOM_GETTER(Process_getArgv);
+JSC_DECLARE_CUSTOM_SETTER(Process_setArgv);
+JSC_DECLARE_CUSTOM_GETTER(Process_getTitle);
+JSC_DECLARE_CUSTOM_GETTER(Process_getPID);
+JSC_DECLARE_CUSTOM_GETTER(Process_getPPID);
+JSC_DECLARE_HOST_FUNCTION(Process_functionCwd);
+static bool processIsExiting = false;
+
+extern "C" uint8_t Bun__getExitCode(void*);
+extern "C" uint8_t Bun__setExitCode(void*, uint8_t);
+extern "C" void* Bun__getVM();
+extern "C" Zig::GlobalObject* Bun__getDefaultGlobal();
+extern "C" const char* Bun__githubURL;
 
-static JSValue constructStdioWriteStream(JSC::JSGlobalObject* globalObject, int fd)
+static void dispatchExitInternal(JSC::JSGlobalObject* globalObject, Process* process, int exitCode)
 {
-    auto& vm = globalObject->vm();
-    auto scope = DECLARE_THROW_SCOPE(vm);
-    auto* thisObject = reinterpret_cast<Zig::GlobalObject*>(globalObject);
-    JSC::JSFunction* getStdioWriteStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdioWriteStreamCodeGenerator(vm), globalObject);
-    JSC::MarkedArgumentBuffer args;
-    WTF::String process = WTF::String("node:process"_s);
-    JSC::JSValue requireFunction = Zig::ImportMetaObject::createRequireFunction(
-        vm,
-        globalObject,
-        process);
-
-    args.append(JSC::jsNumber(fd));
-    args.append(requireFunction);
-
-    auto clientData = WebCore::clientData(vm);
-    JSC::CallData callData = JSC::getCallData(getStdioWriteStream);
-
-    NakedPtr<JSC::Exception> returnedException = nullptr;
-    auto result = JSC::call(globalObject, getStdioWriteStream, callData, globalObject->globalThis(), args, returnedException);
-    RETURN_IF_EXCEPTION(scope, {});
-
-    if (returnedException) {
-        throwException(globalObject, scope, returnedException.get());
-        return {};
-    }
-
-    return result;
-}
 
-JSC_DEFINE_CUSTOM_GETTER(
-    Process_lazyStdinGetter,
-    (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName property))
-{
+    if (processIsExiting)
+        return;
+    processIsExiting = true;
+    auto& emitter = process->wrapped();
     auto& vm = globalObject->vm();
-    auto scope = DECLARE_THROW_SCOPE(vm);
-    JSC::JSValue value = JSC::JSValue::decode(thisValue);
-    if (!value || value.isUndefinedOrNull() || !value.isObject())
-        return JSValue::encode(jsUndefined());
-
-    auto* thisObject = reinterpret_cast<Zig::GlobalObject*>(globalObject);
-    JSC::JSFunction* getStdioWriteStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdinStreamCodeGenerator(vm), globalObject);
-    JSC::MarkedArgumentBuffer args;
-    WTF::String process = WTF::String("node:process"_s);
-    JSC::JSValue requireFunction = Zig::ImportMetaObject::createRequireFunction(
-        vm,
-        globalObject,
-        process);
-
-    args.append(JSC::jsNumber(STDIN_FILENO));
-    args.append(requireFunction);
-    args.append(thisObject->get(globalObject, PropertyName(JSC::Identifier::fromString(vm, "Bun"_s))));
 
-    auto clientData = WebCore::clientData(vm);
-    JSC::CallData callData = JSC::getCallData(getStdioWriteStream);
-
-    NakedPtr<JSC::Exception> returnedException = nullptr;
-    auto result = JSC::call(globalObject, getStdioWriteStream, callData, globalObject->globalThis(), args, returnedException);
-    RETURN_IF_EXCEPTION(scope, {});
+    if (vm.hasTerminationRequest() || vm.hasExceptionsAfterHandlingTraps())
+        return;
 
-    if (UNLIKELY(returnedException)) {
-        throwException(globalObject, scope, returnedException.get());
-        return {};
+    auto event = Identifier::fromString(vm, "exit"_s);
+    if (!emitter.hasEventListeners(event)) {
+        return;
     }
+    process->putDirect(vm, Identifier::fromString(vm, "_exiting"_s), jsBoolean(true), 0);
 
-    if (LIKELY(result))
-        value.getObject()->putDirect(vm, property, result, 0);
-
-    return JSValue::encode(result);
-}
-
-JSC_DEFINE_CUSTOM_GETTER(
-    Process_lazyStdoutGetter,
-    (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName property))
-{
-    JSValue value = JSValue::decode(thisValue);
-    auto& vm = globalObject->vm();
-    JSC::JSObject* thisObject = value.toObject(globalObject);
-    JSC::JSValue stream = constructStdioWriteStream(globalObject, 1);
-
-    if (stream)
-        thisObject->putDirect(vm, property, stream, 0);
-
-    return JSValue::encode(stream);
-}
-
-JSC_DEFINE_CUSTOM_GETTER(
-    Process_lazyStderrGetter, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName property))
-{
-    JSValue value = JSValue::decode(thisValue);
-    auto& vm = globalObject->vm();
-    JSC::JSObject* thisObject = value.toObject(globalObject);
-    JSC::JSValue stream = constructStdioWriteStream(globalObject, 2);
-
-    if (stream)
-        thisObject->putDirect(vm, property, stream, 0);
-
-    return JSValue::encode(stream);
+    MarkedArgumentBuffer arguments;
+    arguments.append(jsNumber(exitCode));
+    emitter.emit(event, arguments);
 }
 
 JSC_DEFINE_CUSTOM_SETTER(Process_defaultSetter,
@@ -323,6 +276,29 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionUmask,
 extern "C" uint64_t Bun__readOriginTimer(void*);
 extern "C" double Bun__readOriginTimerStart(void*);
 
+// Emits "beforeExit" on the process emitter with the exit code. See https://github.com/nodejs/node/blob/1936160c31afc9780e4365de033789f39b7cbc0c/src/api/hooks.cc#L49
+extern "C" void Process__dispatchOnBeforeExit(Zig::GlobalObject* globalObject, uint8_t exitCode)
+{
+    if (!globalObject->hasProcessObject()) { // no-op when hasProcessObject() is false
+        return;
+    }
+
+    auto* process = jsCast<Process*>(globalObject->processObject());
+    MarkedArgumentBuffer arguments;
+    arguments.append(jsNumber(exitCode)); // the exit code is the only argument handed to listeners
+    process->wrapped().emit(Identifier::fromString(globalObject->vm(), "beforeExit"_s), arguments);
+}
+
+extern "C" void Process__dispatchOnExit(Zig::GlobalObject* globalObject, uint8_t exitCode) // C-linkage entry point for exit dispatch
+{
+    if (!globalObject->hasProcessObject()) { // no-op when hasProcessObject() is false
+        return;
+    }
+
+    auto* process = jsCast<Process*>(globalObject->processObject());
+    dispatchExitInternal(globalObject, process, exitCode); // all remaining work is delegated to dispatchExitInternal
+}
+
 JSC_DEFINE_HOST_FUNCTION(Process_functionUptime,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
@@ -335,14 +311,39 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionUptime,
 JSC_DEFINE_HOST_FUNCTION(Process_functionExit,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
-    if (callFrame->argumentCount() == 0) {
-        // TODO: exitCode
-        Bun__Process__exit(globalObject, 0);
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm()); // host function for process exit: validate the code, dispatch exit, terminate
+    uint8_t exitCode = 0;
+    JSValue arg0 = callFrame->argument(0);
+    if (arg0.isNumber()) { // explicit numeric code: must be an int32 within 0..127
+        if (!arg0.isInt32()) {
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+
+        int exitCode32 = arg0.toInt32(globalObject);
+        RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
+
+        if (exitCode32 < 0 || exitCode32 > 127) {
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer between 0 and 127"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+
+        exitCode = static_cast<uint8_t>(exitCode32); // lossless: range was checked just above
+    } else if (!arg0.isUndefinedOrNull()) { // any other non-nullish argument is rejected with a TypeError
+        throwTypeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+        return JSC::JSValue::encode(JSC::JSValue {});
     } else {
-        Bun__Process__exit(globalObject, callFrame->argument(0).toInt32(globalObject));
+        exitCode = Bun__getExitCode(Bun__getVM()); // undefined/null argument: use the exit code reported by Bun__getExitCode
+    }
+
+    auto* zigGlobal = jsDynamicCast<Zig::GlobalObject*>(globalObject);
+    if (UNLIKELY(!zigGlobal)) { // caller's global is not a Zig::GlobalObject: fall back to the default global
+        zigGlobal = Bun__getDefaultGlobal();
     }
 
-    return JSC::JSValue::encode(JSC::jsUndefined());
+    Process__dispatchOnExit(zigGlobal, exitCode); // run exit dispatch before terminating
+    Bun__Process__exit(zigGlobal, exitCode);
+    __builtin_unreachable(); // tells the compiler control never continues past Bun__Process__exit
 }
 
 extern "C" uint64_t Bun__readOriginTimer(void*);
@@ -350,9 +351,12 @@ extern "C" uint64_t Bun__readOriginTimer(void*);
 JSC_DEFINE_HOST_FUNCTION(Process_functionHRTime,
     (JSC::JSGlobalObject * globalObject_, JSC::CallFrame* callFrame))
 {
+
     Zig::GlobalObject* globalObject
         = reinterpret_cast<Zig::GlobalObject*>(globalObject_);
     auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
     uint64_t time = Bun__readOriginTimer(globalObject->bunVM());
     int64_t seconds = static_cast<int64_t>(time / 1000000000);
     int64_t nanoseconds = time % 1000000000;
@@ -361,7 +365,6 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionHRTime,
         JSC::JSValue arg0 = callFrame->uncheckedArgument(0);
         if (!arg0.isUndefinedOrNull()) {
             JSArray* relativeArray = JSC::jsDynamicCast<JSC::JSArray*>(arg0);
-            auto throwScope = DECLARE_THROW_SCOPE(vm);
             if ((!relativeArray && !arg0.isUndefinedOrNull()) || relativeArray->length() < 2) {
                 JSC::throwTypeError(globalObject, throwScope, "hrtime() argument must be an array or undefined"_s);
                 return JSC::JSValue::encode(JSC::JSValue {});
@@ -381,27 +384,38 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionHRTime,
                 seconds--;
                 nanoseconds += 1000000000;
             }
-            throwScope.release();
         }
     }
 
-    auto* array = JSArray::create(vm, globalObject->originalArrayStructureForIndexingType(ArrayWithContiguous), 2);
-    array->setIndexQuickly(vm, 0, JSC::jsNumber(seconds));
-    array->setIndexQuickly(vm, 1, JSC::jsNumber(nanoseconds));
-    return JSC::JSValue::encode(JSC::JSValue(array));
+    JSC::JSArray* array = nullptr;
+    {
+        JSC::ObjectInitializationScope initializationScope(vm);
+        if ((array = JSC::JSArray::tryCreateUninitializedRestricted(
+                 initializationScope, nullptr,
+                 globalObject->arrayStructureForIndexingTypeDuringAllocation(JSC::ArrayWithContiguous),
+                 2))) {
+
+            array->initializeIndex(initializationScope, 0, JSC::jsNumber(seconds));
+            array->initializeIndex(initializationScope, 1, JSC::jsNumber(nanoseconds));
+        }
+    }
+
+    if (UNLIKELY(!array)) {
+        JSC::throwOutOfMemoryError(globalObject, throwScope);
+        return JSC::JSValue::encode(JSC::JSValue {});
+    }
+
+    RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(array));
 }
-static JSC_DECLARE_HOST_FUNCTION(Process_functionHRTimeBigInt);
 
-static JSC_DEFINE_HOST_FUNCTION(Process_functionHRTimeBigInt,
+JSC_DEFINE_HOST_FUNCTION(Process_functionHRTimeBigInt, // returns the Bun__readOriginTimer reading wrapped in a JSBigInt
     (JSC::JSGlobalObject * globalObject_, JSC::CallFrame* callFrame))
 {
     Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(globalObject_);
     return JSC::JSValue::encode(JSValue(JSC::JSBigInt::createFrom(globalObject, Bun__readOriginTimer(globalObject->bunVM()))));
 }
 
-static JSC_DECLARE_HOST_FUNCTION(Process_functionChdir);
-
-static JSC_DEFINE_HOST_FUNCTION(Process_functionChdir,
+JSC_DEFINE_HOST_FUNCTION(Process_functionChdir,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
@@ -423,119 +437,220 @@ static JSC_DEFINE_HOST_FUNCTION(Process_functionChdir,
     return JSC::JSValue::encode(result);
 }
 
-extern "C" const char* Bun__githubURL;
-
-JSC_DEFINE_CUSTOM_GETTER(Process_getterRelease, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName))
+static HashMap<String, int>* signalNameToNumberMap = nullptr; // name -> number; filled once by loadSignalNumberMap()
+static HashMap<int, String>* signalNumberToNameMap = nullptr; // number -> name; filled once inside onDidChangeListeners()
+
+// signal number -> set of script execution context ids that have a listener registered for that signal
+static HashMap<int, HashSet<uint32_t>>* signalToContextIdsMap = nullptr;
+static Lock signalToContextIdsMapLock; // held while reading or updating signalToContextIdsMap entries
+
+static const NeverDestroyed<String> signalNames[] = { // positions matter: the lookup maps are built from signalNames[i] by fixed index
+    MAKE_STATIC_STRING_IMPL("SIGHUP"),
+    MAKE_STATIC_STRING_IMPL("SIGINT"),
+    MAKE_STATIC_STRING_IMPL("SIGQUIT"),
+    MAKE_STATIC_STRING_IMPL("SIGILL"),
+    MAKE_STATIC_STRING_IMPL("SIGTRAP"),
+    MAKE_STATIC_STRING_IMPL("SIGABRT"),
+    MAKE_STATIC_STRING_IMPL("SIGIOT"),
+    MAKE_STATIC_STRING_IMPL("SIGBUS"),
+    MAKE_STATIC_STRING_IMPL("SIGFPE"),
+    MAKE_STATIC_STRING_IMPL("SIGKILL"), // index 9: listed here but left out of both lookup maps
+    MAKE_STATIC_STRING_IMPL("SIGUSR1"),
+    MAKE_STATIC_STRING_IMPL("SIGSEGV"),
+    MAKE_STATIC_STRING_IMPL("SIGUSR2"),
+    MAKE_STATIC_STRING_IMPL("SIGPIPE"),
+    MAKE_STATIC_STRING_IMPL("SIGALRM"),
+    MAKE_STATIC_STRING_IMPL("SIGTERM"),
+    MAKE_STATIC_STRING_IMPL("SIGCHLD"),
+    MAKE_STATIC_STRING_IMPL("SIGCONT"),
+    MAKE_STATIC_STRING_IMPL("SIGSTOP"), // index 18: listed here but left out of both lookup maps
+    MAKE_STATIC_STRING_IMPL("SIGTSTP"),
+    MAKE_STATIC_STRING_IMPL("SIGTTIN"),
+    MAKE_STATIC_STRING_IMPL("SIGTTOU"),
+    MAKE_STATIC_STRING_IMPL("SIGURG"),
+    MAKE_STATIC_STRING_IMPL("SIGXCPU"),
+    MAKE_STATIC_STRING_IMPL("SIGXFSZ"),
+    MAKE_STATIC_STRING_IMPL("SIGVTALRM"),
+    MAKE_STATIC_STRING_IMPL("SIGPROF"),
+    MAKE_STATIC_STRING_IMPL("SIGWINCH"),
+    MAKE_STATIC_STRING_IMPL("SIGIO"),
+    MAKE_STATIC_STRING_IMPL("SIGINFO"), // index 29: SIGINFO is only defined on some platforms
+    MAKE_STATIC_STRING_IMPL("SIGSYS"),
+};
+
+static void loadSignalNumberMap() // builds signalNameToNumberMap exactly once; later calls are no-ops
 {
-    auto& vm = globalObject->vm();
 
-    auto* release = JSC::constructEmptyObject(globalObject);
-    release->putDirect(vm, Identifier::fromString(vm, "name"_s), jsString(vm, WTF::String("bun"_s)), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "lts"_s), jsBoolean(false), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "sourceUrl"_s), jsString(vm, WTF::String(Bun__githubURL, strlen(Bun__githubURL))), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "headersUrl"_s), jsEmptyString(vm), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "libUrl"_s), jsEmptyString(vm), 0);
+    static std::once_flag signalNameToNumberMapOnceFlag;
+    std::call_once(signalNameToNumberMapOnceFlag, [] { // std::call_once guarantees a single initialization
+        signalNameToNumberMap = new HashMap<String, int>(); // never freed; lives for the rest of the process
+        signalNameToNumberMap->reserveInitialCapacity(31);
+        signalNameToNumberMap->add(signalNames[0], SIGHUP);
+        signalNameToNumberMap->add(signalNames[1], SIGINT);
+        signalNameToNumberMap->add(signalNames[2], SIGQUIT);
+        signalNameToNumberMap->add(signalNames[3], SIGILL);
+        signalNameToNumberMap->add(signalNames[4], SIGTRAP);
+        signalNameToNumberMap->add(signalNames[5], SIGABRT);
+        signalNameToNumberMap->add(signalNames[6], SIGIOT);
+        signalNameToNumberMap->add(signalNames[7], SIGBUS);
+        signalNameToNumberMap->add(signalNames[8], SIGFPE);
+        // signalNameToNumberMap->add(signalNames[9], SIGKILL);
+        signalNameToNumberMap->add(signalNames[10], SIGUSR1);
+        signalNameToNumberMap->add(signalNames[11], SIGSEGV);
+        signalNameToNumberMap->add(signalNames[12], SIGUSR2);
+        signalNameToNumberMap->add(signalNames[13], SIGPIPE);
+        signalNameToNumberMap->add(signalNames[14], SIGALRM);
+        signalNameToNumberMap->add(signalNames[15], SIGTERM);
+        signalNameToNumberMap->add(signalNames[16], SIGCHLD);
+        signalNameToNumberMap->add(signalNames[17], SIGCONT);
+        // signalNameToNumberMap->add(signalNames[18], SIGSTOP);
+        signalNameToNumberMap->add(signalNames[19], SIGTSTP);
+        signalNameToNumberMap->add(signalNames[20], SIGTTIN);
+        signalNameToNumberMap->add(signalNames[21], SIGTTOU);
+        signalNameToNumberMap->add(signalNames[22], SIGURG);
+        signalNameToNumberMap->add(signalNames[23], SIGXCPU);
+        signalNameToNumberMap->add(signalNames[24], SIGXFSZ);
+        signalNameToNumberMap->add(signalNames[25], SIGVTALRM);
+        signalNameToNumberMap->add(signalNames[26], SIGPROF);
+        signalNameToNumberMap->add(signalNames[27], SIGWINCH);
+        signalNameToNumberMap->add(signalNames[28], SIGIO);
+#ifdef SIGINFO
+        signalNameToNumberMap->add(signalNames[29], SIGINFO);
+#endif
 
-    return JSValue::encode(release);
+#ifndef SIGINFO
+        signalNameToNumberMap->add(signalNames[29], 255); // placeholder number used when the platform does not define SIGINFO
+#endif
+        signalNameToNumberMap->add(signalNames[30], SIGSYS);
+    });
 }
 
-JSC_DEFINE_CUSTOM_SETTER(Process_setterRelease,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName))
+static void onDidChangeListeners(EventEmitter& eventEmitter, const Identifier& eventName, bool isAdded) // installs/removes OS signal handlers as signal-named listeners are added/removed
 {
-    JSC::VM& vm = globalObject->vm();
+    loadSignalNumberMap(); // make sure the name -> number table exists
+
+    static std::once_flag signalNumberToNameMapOnceFlag;
+    std::call_once(signalNumberToNameMapOnceFlag, [] { // build the reverse (number -> name) table exactly once
+        signalNumberToNameMap = new HashMap<int, String>();
+        signalNumberToNameMap->reserveInitialCapacity(31);
+        signalNumberToNameMap->add(SIGHUP, signalNames[0]);
+        signalNumberToNameMap->add(SIGINT, signalNames[1]);
+        signalNumberToNameMap->add(SIGQUIT, signalNames[2]);
+        signalNumberToNameMap->add(SIGILL, signalNames[3]);
+        signalNumberToNameMap->add(SIGTRAP, signalNames[4]);
+        signalNumberToNameMap->add(SIGABRT, signalNames[5]);
+        signalNumberToNameMap->add(SIGIOT, signalNames[6]);
+        signalNumberToNameMap->add(SIGBUS, signalNames[7]);
+        signalNumberToNameMap->add(SIGFPE, signalNames[8]);
+        // signalNumberToNameMap->add(SIGKILL, signalNames[9]);
+        signalNumberToNameMap->add(SIGUSR1, signalNames[10]);
+        signalNumberToNameMap->add(SIGSEGV, signalNames[11]);
+        signalNumberToNameMap->add(SIGUSR2, signalNames[12]);
+        signalNumberToNameMap->add(SIGPIPE, signalNames[13]);
+        signalNumberToNameMap->add(SIGALRM, signalNames[14]);
+        signalNumberToNameMap->add(SIGTERM, signalNames[15]);
+        signalNumberToNameMap->add(SIGCHLD, signalNames[16]);
+        signalNumberToNameMap->add(SIGCONT, signalNames[17]);
+        // signalNumberToNameMap->add(SIGSTOP, signalNames[18]);
+        signalNumberToNameMap->add(SIGTSTP, signalNames[19]);
+        signalNumberToNameMap->add(SIGTTIN, signalNames[20]);
+        signalNumberToNameMap->add(SIGTTOU, signalNames[21]);
+        signalNumberToNameMap->add(SIGURG, signalNames[22]);
+        signalNumberToNameMap->add(SIGXCPU, signalNames[23]);
+        signalNumberToNameMap->add(SIGXFSZ, signalNames[24]);
+        signalNumberToNameMap->add(SIGVTALRM, signalNames[25]);
+        signalNumberToNameMap->add(SIGPROF, signalNames[26]);
+        signalNumberToNameMap->add(SIGWINCH, signalNames[27]);
+        signalNumberToNameMap->add(SIGIO, signalNames[28]);
+#ifdef SIGINFO
+        signalNumberToNameMap->add(SIGINFO, signalNames[29]); // reverse entry for SIGINFO; the handler below ignores signals missing from this map
+#endif
+        signalNumberToNameMap->add(SIGSYS, signalNames[30]);
+    });
 
-    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
-    thisObject->putDirect(vm, JSC::Identifier::fromString(vm, "release"_s), JSValue::decode(value), 0);
+    if (!signalToContextIdsMap) { // NOTE(review): lazy creation happens outside signalToContextIdsMapLock — confirm callers cannot race here
+        signalToContextIdsMap = new HashMap<int, HashSet<uint32_t>>();
+    }
 
-    return true;
+    if (isAdded) {
+        if (auto signalNumber = signalNameToNumberMap->get(eventName.string())) { // non-signal event names fall through untouched
+            uint32_t contextId = eventEmitter.scriptExecutionContext()->identifier();
+            Locker lock { signalToContextIdsMapLock };
+            if (!signalToContextIdsMap->contains(signalNumber)) { // first listener for this signal: record it and install the OS handler
+                HashSet<uint32_t> contextIds;
+                contextIds.add(contextId);
+                signalToContextIdsMap->set(signalNumber, contextIds);
+
+                lock.unlockEarly(); // the map is not touched again on this path, so release before sigaction
+
+                struct sigaction action;
+                memset(&action, 0, sizeof(struct sigaction));
+
+                // Set the handler in the action struct
+                action.sa_handler = [](int signalNumber) { // NOTE(review): takes a lock and reads hash maps inside a signal handler — confirm this is async-signal-safe enough
+                    if (UNLIKELY(signalNumberToNameMap->find(signalNumber) == signalNumberToNameMap->end()))
+                        return;
+
+                    Locker lock { signalToContextIdsMapLock };
+                    if (UNLIKELY(signalToContextIdsMap->find(signalNumber) == signalToContextIdsMap->end()))
+                        return;
+                    auto contextIds = signalToContextIdsMap->get(signalNumber);
+
+                    for (uint32_t contextId : contextIds) { // ids are stored as uint32_t; iterate with the same type
+                        auto* context = ScriptExecutionContext::getScriptExecutionContext(contextId);
+                        if (UNLIKELY(!context))
+                            continue;
+
+                        JSGlobalObject* lexicalGlobalObject = context->jsGlobalObject();
+                        Zig::GlobalObject* globalObject = static_cast<Zig::GlobalObject*>(lexicalGlobalObject);
+
+                        Process* process = jsCast<Process*>(globalObject->processObject());
+
+                        context->postCrossThreadTask(*process, &Process::emitSignalEvent, signalNumber); // the JS event is emitted later via a posted task, not from the handler itself
+                    }
+                };
+
+                // Clear the sa_mask
+                sigemptyset(&action.sa_mask);
+                sigaddset(&action.sa_mask, signalNumber); // block the same signal while its handler runs
+                action.sa_flags = SA_RESTART;
+
+                sigaction(signalNumber, &action, nullptr); // return value is not checked
+            } else { // handler already installed: just remember this context as well
+                auto contextIds = signalToContextIdsMap->get(signalNumber);
+                contextIds.add(contextId);
+                signalToContextIdsMap->set(signalNumber, contextIds);
+            }
+        }
+    } else {
+        if (auto signalNumber = signalNameToNumberMap->get(eventName.string())) {
+            uint32_t contextId = eventEmitter.scriptExecutionContext()->identifier();
+            Locker lock { signalToContextIdsMapLock };
+            if (signalToContextIdsMap->find(signalNumber) != signalToContextIdsMap->end()) {
+                HashSet<uint32_t> contextIds = signalToContextIdsMap->get(signalNumber); // works on a copy; written back below if still non-empty
+                contextIds.remove(contextId);
+                if (contextIds.isEmpty()) { // last interested context gone: restore the default disposition
+                    signal(signalNumber, SIG_DFL);
+                    signalToContextIdsMap->remove(signalNumber);
+                } else {
+                    signalToContextIdsMap->set(signalNumber, contextIds);
+                }
+            }
+        }
+    }
 }
 
-// static const NeverDestroyed<String> signalNames[] = {
-//     MAKE_STATIC_STRING_IMPL("SIGHUP"),
-//     MAKE_STATIC_STRING_IMPL("SIGINT"),
-//     MAKE_STATIC_STRING_IMPL("SIGQUIT"),
-//     MAKE_STATIC_STRING_IMPL("SIGILL"),
-//     MAKE_STATIC_STRING_IMPL("SIGTRAP"),
-//     MAKE_STATIC_STRING_IMPL("SIGABRT"),
-//     MAKE_STATIC_STRING_IMPL("SIGIOT"),
-//     MAKE_STATIC_STRING_IMPL("SIGBUS"),
-//     MAKE_STATIC_STRING_IMPL("SIGFPE"),
-//     MAKE_STATIC_STRING_IMPL("SIGKILL"),
-//     MAKE_STATIC_STRING_IMPL("SIGUSR1"),
-//     MAKE_STATIC_STRING_IMPL("SIGSEGV"),
-//     MAKE_STATIC_STRING_IMPL("SIGUSR2"),
-//     MAKE_STATIC_STRING_IMPL("SIGPIPE"),
-//     MAKE_STATIC_STRING_IMPL("SIGALRM"),
-//     MAKE_STATIC_STRING_IMPL("SIGTERM"),
-//     MAKE_STATIC_STRING_IMPL("SIGCHLD"),
-//     MAKE_STATIC_STRING_IMPL("SIGCONT"),
-//     MAKE_STATIC_STRING_IMPL("SIGSTOP"),
-//     MAKE_STATIC_STRING_IMPL("SIGTSTP"),
-//     MAKE_STATIC_STRING_IMPL("SIGTTIN"),
-//     MAKE_STATIC_STRING_IMPL("SIGTTOU"),
-//     MAKE_STATIC_STRING_IMPL("SIGURG"),
-//     MAKE_STATIC_STRING_IMPL("SIGXCPU"),
-//     MAKE_STATIC_STRING_IMPL("SIGXFSZ"),
-//     MAKE_STATIC_STRING_IMPL("SIGVTALRM"),
-//     MAKE_STATIC_STRING_IMPL("SIGPROF"),
-//     MAKE_STATIC_STRING_IMPL("SIGWINCH"),
-//     MAKE_STATIC_STRING_IMPL("SIGIO"),
-//     MAKE_STATIC_STRING_IMPL("SIGINFO"),
-//     MAKE_STATIC_STRING_IMPL("SIGSYS"),
-// };
-// static const int signalNumbers[] = {
-//     SIGHUP,
-//     SIGINT,
-//     SIGQUIT,
-//     SIGILL,
-//     SIGTRAP,
-//     SIGABRT,
-//     SIGIOT,
-//     SIGBUS,
-//     SIGFPE,
-//     SIGKILL,
-//     SIGUSR1,
-//     SIGSEGV,
-//     SIGUSR2,
-//     SIGPIPE,
-//     SIGALRM,
-//     SIGTERM,
-//     SIGCHLD,
-//     SIGCONT,
-//     SIGSTOP,
-//     SIGTSTP,
-//     SIGTTIN,
-//     SIGTTOU,
-//     SIGURG,
-//     SIGXCPU,
-//     SIGXFSZ,
-//     SIGVTALRM,
-//     SIGPROF,
-//     SIGWINCH,
-//     SIGIO,
-//     SIGINFO,
-//     SIGSYS,
-// };
-
-// JSC_DEFINE_HOST_FUNCTION(jsFunctionProcessOn, (JSGlobalObject * globalObject, CallFrame* callFrame))
-// {
-//     VM& vm = globalObject->vm();
-//     auto scope = DECLARE_THROW_SCOPE(vm);
-
-//     if (callFrame->argumentCount() < 2) {
-//         throwVMError(globalObject, scope, "Not enough arguments"_s);
-//         return JSValue::encode(jsUndefined());
-//     }
-
-//     String eventName = callFrame->uncheckedArgument(0).toWTFString(globalObject);
-//     RETURN_IF_EXCEPTION(scope, encodedJSValue());
-// }
+void Process::emitSignalEvent(int signalNumber) // emits the event named after signalNumber on this process's emitter
+{
+    String signalName = signalNumberToNameMap->get(signalNumber); // dereferences the map unconditionally; it is created inside onDidChangeListeners
+    Identifier signalNameIdentifier = Identifier::fromString(vm(), signalName);
+    MarkedArgumentBuffer args;
+    args.append(jsNumber(signalNumber)); // listeners receive the signal number as their only argument
+    wrapped().emitForBindings(signalNameIdentifier, args);
+}
 
 Process::~Process()
 {
-    for (auto& listener : this->wrapped().eventListenerMap().entries()) {
-    }
 }
 
 JSC_DEFINE_HOST_FUNCTION(Process_functionAbort, (JSGlobalObject * globalObject, CallFrame*))
@@ -559,201 +674,123 @@ JSC_DEFINE_HOST_FUNCTION(Process_emitWarning, (JSGlobalObject * lexicalGlobalObj
 
     auto* process = jsCast<Process*>(globalObject->processObject());
 
-    auto getError = [&]() -> JSValue {
+    JSObject* errorInstance = ([&]() -> JSObject* {
         JSValue arg0 = callFrame->uncheckedArgument(0);
         if (!arg0.isEmpty() && arg0.isCell() && arg0.asCell()->type() == ErrorInstanceType) {
-            return arg0;
+            return arg0.getObject();
         }
 
         WTF::String str = arg0.toWTFString(globalObject);
         return createError(globalObject, str);
-    };
+    })();
+
+    errorInstance->putDirect(vm, Identifier::fromString(vm, "name"_s), jsString(vm, String("warn"_s)), JSC::PropertyAttribute::DontEnum | 0);
 
     auto ident = Identifier::fromString(vm, "warning"_s);
     if (process->wrapped().hasEventListeners(ident)) {
         JSC::MarkedArgumentBuffer args;
-        args.append(getError());
+        args.append(errorInstance);
 
         process->wrapped().emit(ident, args);
         return JSValue::encode(jsUndefined());
     }
 
-    auto jsArgs = JSValue::encode(getError());
+    auto jsArgs = JSValue::encode(errorInstance);
     Zig__ConsoleClient__messageWithTypeAndLevel(reinterpret_cast<Zig::ConsoleClient*>(globalObject->consoleClient().get())->m_client, static_cast<uint32_t>(MessageType::Log),
         static_cast<uint32_t>(MessageLevel::Warning), globalObject, &jsArgs, 1);
     return JSValue::encode(jsUndefined());
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_lazyArgv0Getter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name))
+JSC_DEFINE_CUSTOM_GETTER(processExitCode, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name)) // custom getter: reads the exit code from the Bun VM
 {
-    JSC::JSObject* thisObject = JSValue::decode(thisValue).getObject();
-    EncodedJSValue ret = Bun__Process__getArgv0(globalObject);
-
-    if (LIKELY(thisObject)) {
-        thisObject->putDirect(globalObject->vm(), name, JSValue::decode(ret), 0);
+    Process* process = jsDynamicCast<Process*>(JSValue::decode(thisValue));
+    if (!process) { // receiver is not a Process: yield undefined instead of throwing
+        return JSValue::encode(jsUndefined());
     }
 
-    return ret;
+    return JSValue::encode(jsNumber(Bun__getExitCode(jsCast<Zig::GlobalObject*>(process->globalObject())->bunVM()))); // uses the Process's own global, not lexicalGlobalObject
 }
-
-JSC_DEFINE_CUSTOM_GETTER(Process_lazyExecArgvGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name))
+JSC_DEFINE_CUSTOM_SETTER(setProcessExitCode, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, JSC::EncodedJSValue value, JSC::PropertyName)) // custom setter: validates the value and stores it on the Bun VM; returns false on any rejection
 {
-    JSC::JSObject* thisObject = JSValue::decode(thisValue).getObject();
-    EncodedJSValue ret = Bun__Process__getExecArgv(globalObject);
-
-    if (LIKELY(thisObject)) {
-        thisObject->putDirect(globalObject->vm(), name, JSValue::decode(ret), 0);
+    Process* process = jsDynamicCast<Process*>(JSValue::decode(thisValue));
+    if (!process) { // receiver is not a Process: reject without throwing
+        return false;
     }
 
-    return ret;
-}
+    auto throwScope = DECLARE_THROW_SCOPE(process->vm());
+    JSValue exitCode = JSValue::decode(value);
+    if (!exitCode.isNumber()) { // non-numbers are a TypeError
+        throwTypeError(lexicalGlobalObject, throwScope, "exitCode must be a number"_s);
+        return false;
+    }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_lazyExecPathGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name))
-{
-    JSC::JSObject* thisObject = JSValue::decode(thisValue).getObject();
-    EncodedJSValue ret = Bun__Process__getExecPath(globalObject);
+    if (!exitCode.isInt32()) { // numbers that are not int32 are a RangeError
+        throwRangeError(lexicalGlobalObject, throwScope, "The \"code\" argument must be an integer"_s);
+        return false;
+    }
 
-    if (LIKELY(thisObject)) {
-        thisObject->putDirect(globalObject->vm(), name, JSValue::decode(ret), 0);
+    int exitCodeInt = exitCode.toInt32(lexicalGlobalObject);
+    RETURN_IF_EXCEPTION(throwScope, false);
+    if (exitCodeInt < 0 || exitCodeInt > 127) { // only 0..127 is accepted
+        throwRangeError(lexicalGlobalObject, throwScope, "exitCode must be between 0 and 127"_s);
+        return false;
     }
 
-    return ret;
+    void* ptr = jsCast<Zig::GlobalObject*>(process->globalObject())->bunVM();
+    Bun__setExitCode(ptr, static_cast<uint8_t>(exitCodeInt)); // lossless: range was checked just above
+    return true;
 }
 
-void Process::finishCreation(JSC::VM& vm)
+static JSValue constructVersions(VM& vm, JSObject* processObject) // builds an object mapping component names to version strings
 {
-    Base::finishCreation(vm);
-    auto clientData = WebCore::clientData(vm);
-    auto* globalObject = reinterpret_cast<Zig::GlobalObject*>(this->globalObject());
-
-    putDirectCustomAccessor(vm, clientData->builtinNames().pidPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getPID, nullptr),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirectCustomAccessor(vm, clientData->builtinNames().ppidPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getPPID, nullptr),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "title"_s),
-        JSC::CustomGetterSetter::create(vm, Process_getTitle, Process_setTitle),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirectCustomAccessor(vm, clientData->builtinNames().argvPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getArgv, Process_setArgv),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirect(vm, JSC::Identifier::fromString(vm, "revision"_s),
-        JSC::jsString(vm, makeAtomString(Bun__version_sha)), 0);
-
-    this->putDirect(vm, clientData->builtinNames().nextTickPublicName(),
-        JSC::JSFunction::create(vm, globalObject, 1,
-            MAKE_STATIC_STRING_IMPL("nextTick"), Process_functionNextTick, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, JSC::Identifier::fromString(vm, "dlopen"_s),
-        JSC::JSFunction::create(vm, globalObject, 1,
-            MAKE_STATIC_STRING_IMPL("dlopen"), Process_functionDlopen, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, clientData->builtinNames().cwdPublicName(),
-        JSC::JSFunction::create(vm, globalObject, 0,
-            MAKE_STATIC_STRING_IMPL("cwd"), Process_functionCwd, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, clientData->builtinNames().chdirPublicName(),
-        JSC::JSFunction::create(vm, globalObject, 0,
-            MAKE_STATIC_STRING_IMPL("chdir"), Process_functionChdir, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, JSC::Identifier::fromString(vm, "exit"_s),
-        JSC::JSFunction::create(vm, globalObject, 0,
-            MAKE_STATIC_STRING_IMPL("exit"), Process_functionExit, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    putDirectCustomAccessor(
-        vm, clientData->builtinNames().versionsPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getVersionsLazy, Process_setVersionsLazy), 0);
-    // this should be transpiled out, but just incase
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "browser"_s),
-        JSC::JSValue(false));
-
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "exitCode"_s),
-        JSC::JSValue(JSC::jsNumber(0)));
-
-    this->putDirect(this->vm(), clientData->builtinNames().versionPublicName(),
-        JSC::jsString(this->vm(), makeString("v", REPORTED_NODE_VERSION)));
-
-    // this gives some way of identifying at runtime whether the SSR is happening in node or not.
-    // this should probably be renamed to what the name of the bundler is, instead of "notNodeJS"
-    // but it must be something that won't evaluate to truthy in Node.js
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "isBun"_s), JSC::JSValue(true));
-#if defined(__APPLE__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "platform"_s),
-        JSC::jsString(this->vm(), makeAtomString("darwin")));
-#else
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "platform"_s),
-        JSC::jsString(this->vm(), makeAtomString("linux")));
-#endif
-
-#if defined(__x86_64__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("x64")));
-#elif defined(__i386__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("x86")));
-#elif defined(__arm__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("arm")));
-#elif defined(__aarch64__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("arm64")));
-#endif
-
-    JSC::JSFunction* hrtime = JSC::JSFunction::create(vm, globalObject, 0,
-        MAKE_STATIC_STRING_IMPL("hrtime"), Process_functionHRTime, ImplementationVisibility::Public);
-
-    JSC::JSFunction* hrtimeBigInt = JSC::JSFunction::create(vm, globalObject, 0,
-        MAKE_STATIC_STRING_IMPL("bigint"), Process_functionHRTimeBigInt, ImplementationVisibility::Public);
-
-    hrtime->putDirect(vm, JSC::Identifier::fromString(vm, "bigint"_s), hrtimeBigInt);
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "hrtime"_s), hrtime);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "release"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_getterRelease, Process_setterRelease), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "stdout"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyStdoutGetter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "stderr"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyStderrGetter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "stdin"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyStdinGetter, Process_defaultSetter), 0);
-
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "abort"_s),
-        0, Process_functionAbort, ImplementationVisibility::Public, NoIntrinsic, 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "argv0"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyArgv0Getter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "execPath"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyExecPathGetter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "execArgv"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyExecArgvGetter, Process_defaultSetter), 0);
+    auto* globalObject = processObject->globalObject();
+    JSC::JSObject* object = JSC::constructEmptyObject(globalObject, globalObject->objectPrototype(), 19); // 19: presumably an inline-capacity hint for the puts below — TODO confirm
 
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "uptime"_s),
-        0, Process_functionUptime, ImplementationVisibility::Public, NoIntrinsic, 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "node"_s),
+        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(REPORTED_NODE_VERSION))));
+    object->putDirect(
+        vm, JSC::Identifier::fromString(vm, "bun"_s),
+        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(Bun__version + 1 /* prefix with v */)))); // "+ 1" skips the first character of Bun__version
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s), // NOTE(review): "webkit" is put a second time further down with Bun__versions_webkit
+        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(BUN_WEBKIT_VERSION))));
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "boringssl"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_boringssl))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "libarchive"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_libarchive))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "mimalloc"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_mimalloc))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "picohttpparser"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_picohttpparser))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "uwebsockets"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_uws))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s), // NOTE(review): same key as the BUN_WEBKIT_VERSION put above; one of the two is likely redundant
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_webkit))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "zig"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zig))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "zlib"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zlib))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "tinycc"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_tinycc))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "lolhtml"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_lolhtml))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "ares"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_c_ares))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "usockets"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_usockets))), 0);
 
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "umask"_s),
-        1, Process_functionUmask, ImplementationVisibility::Public, NoIntrinsic, 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "v8"_s), JSValue(JSC::jsString(vm, makeString("10.8.168.20-node.8"_s))), 0); // hard-coded string literal, as are "uv", "napi" and "modules" below
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "uv"_s), JSValue(JSC::jsString(vm, makeString("1.44.2"_s))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "napi"_s), JSValue(JSC::jsString(vm, makeString("8"_s))), 0);
 
-    this->putDirectBuiltinFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "binding"_s),
-        processObjectInternalsBindingCodeGenerator(vm),
-        0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "modules"_s),
+        JSC::JSValue(JSC::jsString(vm, makeAtomString("108"))));
 
-    this->putDirect(vm, vm.propertyNames->toStringTagSymbol, jsString(vm, String("process"_s)), 0);
+    return object;
+}
 
+static JSValue constructProcessConfigObject(VM& vm, JSObject* processObject)
+{
+    auto* globalObject = processObject->globalObject();
     //   target_defaults:
     //    { cflags: [],
     //      default_configuration: 'Release',
@@ -783,168 +820,779 @@ void Process::finishCreation(JSC::VM& vm)
         JSC::jsNumber(1), 0);
     config->putDirect(vm, JSC::Identifier::fromString(vm, "target_defaults"_s), JSC::constructEmptyObject(globalObject), 0);
     config->putDirect(vm, JSC::Identifier::fromString(vm, "variables"_s), variables, 0);
-    this->putDirect(vm, JSC::Identifier::fromString(vm, "config"_s), config, 0);
 
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "emitWarning"_s),
-        1, Process_emitWarning, ImplementationVisibility::Public, NoIntrinsic, 0);
+    return config;
 }
 
-const JSC::ClassInfo Process::s_info = { "Process"_s, &Base::s_info, nullptr, nullptr,
-    CREATE_METHOD_TABLE(Process) };
+static JSValue constructProcessReleaseObject(VM& vm, JSObject* processObject)
+{
+    auto* globalObject = processObject->globalObject();
+    auto* release = JSC::constructEmptyObject(globalObject);
+    release->putDirect(vm, Identifier::fromString(vm, "name"_s), jsString(vm, WTF::String("bun"_s)), 0);
+    release->putDirect(vm, Identifier::fromString(vm, "lts"_s), jsBoolean(false), 0);
+    release->putDirect(vm, Identifier::fromString(vm, "sourceUrl"_s), jsString(vm, WTF::String(Bun__githubURL, strlen(Bun__githubURL))), 0);
+    release->putDirect(vm, Identifier::fromString(vm, "headersUrl"_s), jsEmptyString(vm), 0);
+    release->putDirect(vm, Identifier::fromString(vm, "libUrl"_s), jsEmptyString(vm), 0);
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getTitle, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+    return release;
+}
+
+static JSValue constructProcessHrtimeObject(VM& vm, JSObject* processObject)
 {
-    ZigString str;
-    Bun__Process__getTitle(globalObject, &str);
-    return JSValue::encode(Zig::toJSStringValue(str, globalObject));
+    auto* globalObject = processObject->globalObject();
+    JSC::JSFunction* hrtime = JSC::JSFunction::create(vm, globalObject, 0,
+        String("hrtime"_s), Process_functionHRTime, ImplementationVisibility::Public);
+
+    JSC::JSFunction* hrtimeBigInt = JSC::JSFunction::create(vm, globalObject, 0,
+        String("bigint"_s), Process_functionHRTimeBigInt, ImplementationVisibility::Public);
+
+    hrtime->putDirect(vm, JSC::Identifier::fromString(vm, "bigint"_s), hrtimeBigInt);
+
+    return hrtime;
 }
 
-JSC_DEFINE_CUSTOM_SETTER(Process_setTitle,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName))
+static JSValue constructStdioWriteStream(JSC::JSGlobalObject* globalObject, int fd)
+{
+    // Instantiates the getStdioWriteStream builtin and calls it with `fd` as the only argument.
+    // Returns the builtin's result, or an empty JSValue after an exception has been thrown.
+    auto& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSC::JSFunction* getStdioWriteStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdioWriteStreamCodeGenerator(vm), globalObject);
+    JSC::MarkedArgumentBuffer args;
+    args.append(JSC::jsNumber(fd));
+
+    JSC::CallData callData = JSC::getCallData(getStdioWriteStream);
+    NakedPtr<JSC::Exception> returnedException = nullptr;
+    auto result = JSC::call(globalObject, getStdioWriteStream, callData, globalObject->globalThis(), args, returnedException);
+    RETURN_IF_EXCEPTION(scope, {});
+
+    // An exception handed back through the out-parameter is rethrown on this scope.
+    if (UNLIKELY(returnedException)) {
+        throwException(globalObject, scope, returnedException.get());
+        return {};
+    }
+    RELEASE_AND_RETURN(scope, result);
+}
+
+static JSValue constructStdout(VM& vm, JSObject* processObject)
+{
+    // Write stream for file descriptor 1, built against the default global object.
+    return constructStdioWriteStream(Bun__getDefaultGlobal(), 1);
+}
+
+static JSValue constructStderr(VM& vm, JSObject* processObject)
+{
+    // Write stream for file descriptor 2, built against the default global object.
+    return constructStdioWriteStream(Bun__getDefaultGlobal(), 2);
+}
+
+static JSValue constructStdin(VM& vm, JSObject* processObject)
+{
+    // Instantiates the getStdinStream builtin and calls it with STDIN_FILENO, using the default global as `this`.
+    // Returns the builtin's result, or an empty JSValue after an exception has been thrown.
+    auto* globalObject = Bun__getDefaultGlobal();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSC::JSFunction* getStdinStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdinStreamCodeGenerator(vm), globalObject);
+    JSC::MarkedArgumentBuffer args;
+    args.append(JSC::jsNumber(STDIN_FILENO));
+
+    JSC::CallData callData = JSC::getCallData(getStdinStream);
+
+    NakedPtr<JSC::Exception> returnedException = nullptr;
+    auto result = JSC::call(globalObject, getStdinStream, callData, globalObject, args, returnedException);
+    RETURN_IF_EXCEPTION(scope, {});
+
+    if (UNLIKELY(returnedException)) {
+        throwException(globalObject, scope, returnedException.get());
+        return {};
+    }
+
+    RELEASE_AND_RETURN(scope, result);
+}
+
+static JSValue constructPid(VM& vm, JSObject* processObject)
+{
+    return jsNumber(getpid()); // getpid() result boxed as a JS number
+}
+
+static JSValue constructPpid(VM& vm, JSObject* processObject)
+{
+    return jsNumber(getppid()); // getppid() result (parent process ID) boxed as a JS number
+}
+
+static JSValue constructArgv0(VM& vm, JSObject* processObject)
+{
+    // The value comes from Bun__Process__getArgv0, called with the process object's global.
+    return JSValue::decode(Bun__Process__getArgv0(processObject->globalObject()));
+}
+
+static JSValue constructExecArgv(VM& vm, JSObject* processObject)
+{
+    // The value comes from Bun__Process__getExecArgv, called with the process object's global.
+    return JSValue::decode(Bun__Process__getExecArgv(processObject->globalObject()));
+}
+
+static JSValue constructExecPath(VM& vm, JSObject* processObject)
+{
+    // The value comes from Bun__Process__getExecPath, called with the process object's global.
+    return JSValue::decode(Bun__Process__getExecPath(processObject->globalObject()));
+}
+
+static JSValue constructArgv(VM& vm, JSObject* processObject)
+{
+    // The value comes from Bun__Process__getArgv, called with the process object's global.
+    return JSValue::decode(Bun__Process__getArgv(processObject->globalObject()));
+}
+
+static JSValue constructArch(VM& vm, JSObject* processObject)
+{ // The architecture name is fixed at compile time from the compiler's predefined target macros.
+#if defined(__x86_64__)
+    return JSC::jsString(vm, makeAtomString("x64"));
+#elif defined(__i386__)
+    return JSC::jsString(vm, makeAtomString("x86"));
+#elif defined(__arm__)
+    return JSC::jsString(vm, makeAtomString("arm"));
+#elif defined(__aarch64__)
+    return JSC::jsString(vm, makeAtomString("arm64"));
+#else
+#error "Unknown architecture"
+#endif
+}
+
+static JSValue constructPlatform(VM& vm, JSObject* processObject)
+{ // The platform name is fixed at compile time; only macOS ("darwin") and Linux builds are accepted.
+#if defined(__APPLE__)
+    return JSC::jsString(vm, makeAtomString("darwin"));
+#elif defined(__linux__)
+    return JSC::jsString(vm, makeAtomString("linux"));
+#else
+#error "Unknown platform"
+#endif
+}
+
+static JSValue constructBrowser(VM& vm, JSObject* processObject)
+{
+    return jsBoolean(false); // constant: always the JS boolean false
+}
+
+static JSValue constructVersion(VM& vm, JSObject* processObject)
+{
+    return JSC::jsString(vm, makeString("v", REPORTED_NODE_VERSION)); // "v" followed by REPORTED_NODE_VERSION
+}
+
+static JSValue constructIsBun(VM& vm, JSObject* processObject)
+{
+    return jsBoolean(true); // constant: always the JS boolean true
+}
+
+static JSValue constructRevision(VM& vm, JSObject* processObject)
+{
+    return JSC::jsString(vm, makeAtomString(Bun__version_sha)); // Bun__version_sha as a JS string
+}
+
+static JSValue constructEnv(VM& vm, JSObject* processObject)
+{
+    // Fetched from the Zig::GlobalObject that the process object belongs to.
+    return jsCast<Zig::GlobalObject*>(processObject->globalObject())->processEnvObject();
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetuid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(getuid())); // real user ID from getuid(), as a JS number
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongeteuid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(geteuid())); // effective user ID from geteuid(), as a JS number
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetegid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(getegid())); // effective group ID from getegid(), as a JS number
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetgid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(getgid())); // real group ID from getgid(), as a JS number
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetgroups, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    // Builds a JS array of this process's supplementary group IDs, appending the effective GID when absent.
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    int ngroups = getgroups(0, nullptr); // a size of 0 only queries how many groups there are
+    Vector<gid_t> groupVector(ngroups > 0 ? ngroups : 0);
+    if (ngroups > 0)
+        ngroups = getgroups(ngroups, groupVector.data()); // second call actually fills the list
+    if (ngroups == -1) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("getgroups"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
+
+    gid_t egid = getegid();
+    unsigned count = static_cast<unsigned>(ngroups);
+    JSArray* groups = constructEmptyArray(globalObject, nullptr, count);
+    bool needsEgid = true;
+    for (unsigned i = 0; i < count; i++) {
+        gid_t current = groupVector[i];
+        if (current == egid)
+            needsEgid = false; // the effective GID is already part of the list
+
+        groups->putDirectIndex(globalObject, i, jsNumber(current));
+    }
+    if (needsEgid)
+        groups->push(globalObject, jsNumber(egid));
+    return JSValue::encode(groups);
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionAssert, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    // Throws an Error whose "code" is "ERR_ASSERTION" when the first argument is falsy; otherwise returns undefined.
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    bool passed = callFrame->argument(0).toBoolean(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+    if (passed)
+        return JSValue::encode(jsUndefined());
+
+    // The optional second argument supplies the text that follows "Assertion failed: ".
+    JSValue messageValue = callFrame->argument(1);
+    String detail = messageValue.isUndefined() ? String() : messageValue.toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+    auto assertionError = createError(globalObject, makeString("Assertion failed: "_s, detail));
+    assertionError->putDirect(vm, Identifier::fromString(vm, "code"_s), jsString(vm, makeString("ERR_ASSERTION"_s)));
+    throwException(globalObject, throwScope, assertionError);
+    return JSValue::encode(jsUndefined());
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionReallyExit, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    // Validates the optional exit code (an int32 within 0..127) and hands it to Bun__Process__exit.
+    // When the argument is undefined or null, the code reported by Bun__getExitCode is used instead.
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    uint8_t exitCode = 0;
+    JSValue codeArg = callFrame->argument(0);
+    if (codeArg.isUndefinedOrNull()) {
+        exitCode = Bun__getExitCode(Bun__getVM());
+    } else if (!codeArg.isNumber()) {
+        throwTypeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+        return JSC::JSValue::encode(JSC::JSValue {});
+    } else {
+        if (!codeArg.isInt32()) {
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+        int requestedCode = codeArg.toInt32(globalObject);
+        RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
+
+        if (requestedCode < 0 || requestedCode > 127) {
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer between 0 and 127"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+        exitCode = static_cast<uint8_t>(requestedCode);
+    }
+
+    auto* zigGlobal = jsDynamicCast<Zig::GlobalObject*>(globalObject);
+    if (UNLIKELY(!zigGlobal))
+        zigGlobal = Bun__getDefaultGlobal();
+
+    Bun__Process__exit(zigGlobal, exitCode);
+    __builtin_unreachable();
+}
+
+template<typename Visitor>
+void Process::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{ // Visits the base class's children, then the two lazily created usage structures held by Process.
+    Process* thisObject = jsCast<Process*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    Base::visitChildren(thisObject, visitor);
+    thisObject->cpuUsageStructure.visit(visitor); // structure used for cpuUsage() results
+    thisObject->memoryUsageStructure.visit(visitor); // structure used for memoryUsage() results
+}
+
+DEFINE_VISIT_CHILDREN(Process);
+
+static Structure* constructCPUUsageStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
+{
+    // Structure for cpuUsage() results: the cached empty-object structure for Object.prototype
+    // (third argument 2, presumably the inline capacity) with "user" then "system" added.
+    JSC::Structure* shape = globalObject->structureCache().emptyObjectStructureForPrototype(
+        globalObject,
+        globalObject->objectPrototype(),
+        2);
+
+    // Receives the offset assigned by each transition; it is not read in this function.
+    PropertyOffset assignedOffset;
+
+    const auto userName = JSC::Identifier::fromString(vm, "user"_s);
+    shape = shape->addPropertyTransition(vm, shape, userName, 0, assignedOffset);
+    const auto systemName = JSC::Identifier::fromString(vm, "system"_s);
+    shape = shape->addPropertyTransition(vm, shape, systemName, 0, assignedOffset);
+    return shape;
+}
+static Structure* constructMemoryUsageStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
+{
+    // Builds the Structure used for memoryUsage() result objects.
+    // Starts from the cached empty-object structure for Object.prototype (third argument 5,
+    // presumably the inline capacity) and adds one property per transition below.
+    // Process_functionMemoryUsage writes the values by offset (0 through 4), so the order
+    // of the transitions here has to stay in step with that function.
+    JSC::Structure* shape = globalObject->structureCache().emptyObjectStructureForPrototype(
+        globalObject,
+        globalObject->objectPrototype(),
+        5);
+
+    // Receives the offset assigned by each transition; it is not read in this function.
+    PropertyOffset assignedOffset;
+
+    // Property 1 of 5.
+    const auto rssName = JSC::Identifier::fromString(vm, "rss"_s);
+    shape = shape->addPropertyTransition(vm, shape, rssName, 0, assignedOffset);
+
+    // Property 2 of 5.
+    const auto heapTotalName = JSC::Identifier::fromString(vm, "heapTotal"_s);
+    shape = shape->addPropertyTransition(vm, shape, heapTotalName, 0, assignedOffset);
+
+    // Property 3 of 5.
+    const auto heapUsedName = JSC::Identifier::fromString(vm, "heapUsed"_s);
+    shape = shape->addPropertyTransition(vm, shape, heapUsedName, 0, assignedOffset);
+
+    // Property 4 of 5.
+    const auto externalName = JSC::Identifier::fromString(vm, "external"_s);
+    shape = shape->addPropertyTransition(vm, shape, externalName, 0, assignedOffset);
+
+    // Property 5 of 5.
+    const auto arrayBuffersName = JSC::Identifier::fromString(vm, "arrayBuffers"_s);
+    shape = shape->addPropertyTransition(vm, shape, arrayBuffersName, 0, assignedOffset);
+
+    return shape;
+}
+
+static Process* getProcessObject(JSC::JSGlobalObject* lexicalGlobalObject, JSValue thisValue)
+{
+    // Resolves the Process to operate on: `thisValue` when it is a Process, otherwise the process
+    // object of a Zig::GlobalObject (the lexical global if it is one, else the default global).
+    Process* receiver = jsDynamicCast<Process*>(thisValue);
+    if (LIKELY(receiver)) {
+        return receiver;
+    }
+
+    // Detached call, e.g. "var memoryUsage = process.memoryUsage; memoryUsage()".
+    // Inside a node:vm the lexical global may not be a Zig::GlobalObject.
+    Zig::GlobalObject* owner = jsDynamicCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    if (UNLIKELY(!owner)) {
+        owner = Bun__getDefaultGlobal();
+    }
+
+    return jsCast<Process*>(owner->processObject());
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionCpuUsage,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    struct rusage rusage;
+    if (getrusage(RUSAGE_SELF, &rusage) != 0) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("getrusage"_s);
+        error.message = Bun::toString("Failed to get CPU usage"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
 
-    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
-    JSC::JSString* jsString = JSC::jsDynamicCast<JSC::JSString*>(JSValue::decode(value));
-    if (!thisObject || !jsString) {
-        return false;
+    auto* process = getProcessObject(globalObject, callFrame->thisValue());
+
+    Structure* cpuUsageStructure = process->cpuUsageStructure.getInitializedOnMainThread(process);
+
+    constexpr double MICROS_PER_SEC = 1000000.0;
+
+    double user = MICROS_PER_SEC * rusage.ru_utime.tv_sec + rusage.ru_utime.tv_usec;
+    double system = MICROS_PER_SEC * rusage.ru_stime.tv_sec + rusage.ru_stime.tv_usec;
+
+    if (callFrame->argumentCount() > 0) {
+        JSValue comparatorValue = callFrame->argument(0);
+        if (!comparatorValue.isUndefined()) {
+            if (UNLIKELY(!comparatorValue.isObject())) {
+                throwTypeError(globalObject, throwScope, "Expected an object as the first argument"_s);
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            JSC::JSObject* comparator = comparatorValue.getObject();
+            JSValue userValue;
+            JSValue systemValue;
+
+            if (LIKELY(comparator->structureID() == cpuUsageStructure->id())) {
+                userValue = comparator->getDirect(0);
+                systemValue = comparator->getDirect(1);
+            } else {
+                userValue = comparator->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "user"_s));
+                RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::jsUndefined()));
+
+                systemValue = comparator->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "system"_s));
+                RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::jsUndefined()));
+            }
+
+            if (UNLIKELY(!userValue || !userValue.isNumber())) {
+                throwTypeError(globalObject, throwScope, "Expected a number for the user property"_s);
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            if (UNLIKELY(!systemValue || !systemValue.isNumber())) {
+                throwTypeError(globalObject, throwScope, "Expected a number for the system property"_s);
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            double userComparator = userValue.asNumber();
+            double systemComparator = systemValue.asNumber();
+
+            user -= userComparator;
+            system -= systemComparator;
+        }
     }
 
-    ZigString str = Zig::toZigString(jsString, globalObject);
-    Bun__Process__setTitle(globalObject, &str);
+    JSC::JSObject* result = JSC::constructEmptyObject(vm, cpuUsageStructure);
+    RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::jsUndefined()));
 
-    return true;
+    result->putDirectOffset(vm, 0, JSC::jsNumber(user));
+    result->putDirectOffset(vm, 1, JSC::jsNumber(system));
+
+    RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(result));
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getArgv, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+static int getRSS(size_t* rss)
+{
+    // Stores the resident set size of this process, in bytes, in *rss.
+    // Returns 0 on success; on failure returns -1 (macOS) or an errno-style code (Linux).
+#if defined(__APPLE__)
+    mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT;
+    task_basic_info_data_t info;
+    kern_return_t err;
+    err = task_info(mach_task_self(),
+        TASK_BASIC_INFO,
+        reinterpret_cast<task_info_t>(&info),
+        &count);
+
+    if (err == KERN_SUCCESS) {
+        *rss = (size_t)info.resident_size;
+        return 0;
+    }
+
+    return -1;
+#elif defined(__linux__)
+    // Taken from libuv.
+    char buf[1024];
+    const char* s;
+    ssize_t n;
+    long val;
+    int fd;
+    int i;
+
+    do
+        fd = open("/proc/self/stat", O_RDONLY);
+    while (fd == -1 && errno == EINTR);
+
+    if (fd == -1)
+        return errno;
+
+    do
+        n = read(fd, buf, sizeof(buf) - 1);
+    while (n == -1 && errno == EINTR);
+
+    // Capture read()'s errno now: a failing close() below would overwrite it.
+    const int readErrno = errno;
+    // close() is not retried on EINTR; on Linux the descriptor is released even then.
+    close(fd);
+
+    if (n == -1)
+        return readErrno;
+    buf[n] = '\0';
+
+    s = strchr(buf, ' ');
+    if (s == NULL)
+        goto err;
+
+    s += 1;
+    if (*s != '(')
+        goto err;
+
+    s = strchr(s, ')');
+    if (s == NULL)
+        goto err;
+
+    for (i = 1; i <= 22; i++) { // advance past 22 space-separated fields following the ")"
+        s = strchr(s + 1, ' ');
+        if (s == NULL)
+            goto err;
+    }
+
+    errno = 0;
+    val = strtol(s, NULL, 10);
+    if (errno != 0)
+        goto err;
+    if (val < 0)
+        goto err;
+
+    *rss = val * getpagesize(); // the parsed value is multiplied by the page size to get bytes
+    return 0;
+
+err:
+    return EINVAL;
+#else
+#error "Unsupported platform"
+#endif
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionMemoryUsage,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    auto* process = getProcessObject(globalObject, callFrame->thisValue());
+
+    size_t current_rss = 0;
+    if (int rssError = getRSS(&current_rss)) { // a nonzero return means the RSS lookup failed
+        SystemError error;
+        error.errno_ = rssError > 0 ? rssError : errno; // prefer getRSS's own code; errno can be stale or 0 here
+        error.syscall = Bun::toString("memoryUsage"_s);
+        error.message = Bun::toString("Failed to get memory usage"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSC::JSValue::encode(JSC::JSValue {});
+    }
 
-    Zig::Process* thisObject = JSC::jsDynamicCast<Zig::Process*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return JSValue::encode(JSC::jsUndefined());
+    JSC::JSObject* result = JSC::constructEmptyObject(vm, process->memoryUsageStructure.getInitializedOnMainThread(process));
+    if (UNLIKELY(throwScope.exception())) {
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    JSC::EncodedJSValue argv_ = Bun__Process__getArgv(globalObject);
-    auto clientData = WebCore::clientData(vm);
+    // Node.js:
+    // {
+    //    rss: 4935680,
+    //    heapTotal: 1826816,
+    //    heapUsed: 650472,
+    //    external: 49879,
+    //    arrayBuffers: 9386
+    // }
 
-    thisObject->putDirect(vm, clientData->builtinNames().argvPublicName(),
-        JSC::JSValue::decode(argv_), 0);
+    result->putDirectOffset(vm, 0, JSC::jsNumber(current_rss));
+    result->putDirectOffset(vm, 1, JSC::jsNumber(vm.heap.blockBytesAllocated()));
 
-    return argv_;
+    // heap.size() loops through every cell...
+    // TODO: add a binding for heap.sizeAfterLastCollection()
+    result->putDirectOffset(vm, 2, JSC::jsNumber(vm.heap.sizeAfterLastEdenCollection()));
+
+    result->putDirectOffset(vm, 3, JSC::jsNumber(vm.heap.externalMemorySize()));
+
+    // We report 0 for this because m_arrayBuffers in JSC::Heap is private and we need to add a binding
+    // If we use objectTypeCounts(), it's hideously slow because it loops through every single object in the heap
+    // TODO: add a binding for m_arrayBuffers, registerWrapper() in TypedArrayController doesn't work
+    result->putDirectOffset(vm, 4, JSC::jsNumber(0));
+
+    RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(result));
 }
 
-JSC_DEFINE_CUSTOM_SETTER(Process_setArgv,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName))
+JSC_DEFINE_HOST_FUNCTION(Process_functionMemoryUsageRSS,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm); // returns the value from getRSS() as a JS number, or throws a SystemError
 
-    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return false;
+    size_t current_rss = 0;
+    if (int rssError = getRSS(&current_rss)) { // a nonzero return means the RSS lookup failed
+        SystemError error;
+        error.errno_ = rssError > 0 ? rssError : errno; // prefer getRSS's own code; errno can be stale or 0 here
+        error.syscall = Bun::toString("memoryUsage"_s);
+        error.message = Bun::toString("Failed to get memory usage"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    auto clientData = WebCore::clientData(vm);
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsNumber(current_rss)));
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionOpenStdin, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{ // Looks up "stdin" on the process object, calls its resume() when present, and returns the stdin value.
+    auto& vm = globalObject->vm();
+    Zig::GlobalObject* global = jsDynamicCast<Zig::GlobalObject*>(globalObject);
+    if (UNLIKELY(!global)) {
+        global = Bun__getDefaultGlobal();
+    }
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    if (JSValue stdin = global->processObject()->getIfPropertyExists(globalObject, Identifier::fromString(vm, "stdin"_s))) {
+        RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+
+        if (!stdin.isObject()) {
+            throwTypeError(globalObject, throwScope, "stdin is not an object"_s);
+            return JSValue::encode(jsUndefined());
+        }
+
+        JSValue resumeValue = stdin.getObject()->getIfPropertyExists(globalObject, Identifier::fromString(vm, "resume"_s));
+        RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+        if (resumeValue && !resumeValue.isUndefinedOrNull()) { // an empty JSValue means "resume" is missing: skip the call
+            auto resumeFunction = jsDynamicCast<JSFunction*>(resumeValue);
+            if (UNLIKELY(!resumeFunction)) {
+                throwTypeError(globalObject, throwScope, "stdin.resume is not a function"_s);
+                return JSValue::encode(jsUndefined());
+            }
+
+            auto callData = getCallData(resumeFunction);
+
+            MarkedArgumentBuffer args;
+            JSC::call(globalObject, resumeFunction, callData, stdin, args);
+            RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+        }
+
+        RELEASE_AND_RETURN(throwScope, JSValue::encode(stdin));
+    }
 
-    return thisObject->putDirect(vm, clientData->builtinNames().argvPublicName(),
-        JSC::JSValue::decode(value));
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsUndefined()));
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getPID, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+JSC_DEFINE_HOST_FUNCTION(Process_stubEmptyFunction, (JSGlobalObject * globalObject, CallFrame* callFrame))
 {
-    return JSC::JSValue::encode(JSC::JSValue(getpid()));
+    return JSValue::encode(jsUndefined());
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getPPID, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+JSC_DEFINE_HOST_FUNCTION(Process_stubFunctionReturningArray, (JSGlobalObject * globalObject, CallFrame* callFrame))
 {
-    return JSC::JSValue::encode(JSC::JSValue(getppid()));
+    return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr));
 }
 
-#if !defined(BUN_WEBKIT_VERSION)
-#define BUN_WEBKIT_VERSION "unknown"
-#endif
+static JSValue Process_stubEmptyObject(VM& vm, JSObject* processObject)
+{
+    return JSC::constructEmptyObject(processObject->globalObject()); // placeholder value: a new empty object
+}
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getVersionsLazy,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::PropertyName))
+static JSValue Process_stubEmptyArray(VM& vm, JSObject* processObject)
 {
-    JSC::VM& vm = globalObject->vm();
-    auto clientData = WebCore::clientData(vm);
+    return JSC::constructEmptyArray(processObject->globalObject(), nullptr);
+}
 
-    Zig::Process* thisObject = JSC::jsDynamicCast<Zig::Process*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return JSValue::encode(JSC::jsUndefined());
+static JSValue Process_stubEmptySet(VM& vm, JSObject* processObject)
+{
+    // Placeholder value: a new Set created with the global object's Set structure.
+    return JSSet::create(vm, processObject->globalObject()->setStructure());
+}
+
+static JSValue constructMemoryUsage(VM& vm, JSObject* processObject)
+{
+    // Creates the memoryUsage function and attaches a second function to it under the name "rss".
+    auto* globalObject = processObject->globalObject();
+    auto* usageFunction = JSC::JSFunction::create(vm, globalObject, 0, String("memoryUsage"_s), Process_functionMemoryUsage, ImplementationVisibility::Public);
+    auto* rssFunction = JSC::JSFunction::create(vm, globalObject, 0, String("rss"_s), Process_functionMemoryUsageRSS, ImplementationVisibility::Public);
+
+    const auto rssName = JSC::Identifier::fromString(vm, "rss"_s);
+    usageFunction->putDirect(vm, rssName, rssFunction, JSC::PropertyAttribute::Function | 0);
+
+    return usageFunction;
+}
+
+static JSValue constructFeatures(VM& vm, JSObject* processObject)
+{
+    // Reference shape this object is modelled on:
+    // {
+    //     inspector: true,
+    //     debug: false,
+    //     uv: true,
+    //     ipv6: true,
+    //     tls_alpn: true,
+    //     tls_sni: true,
+    //     tls_ocsp: true,
+    //     tls: true,
+    //     cached_builtins: [Getter]
+    // }
+    // Here every entry, cached_builtins included, is stored as a plain boolean.
+    auto* features = constructEmptyObject(processObject->globalObject());
+    auto setFlag = [&](auto name, bool enabled) {
+        features->putDirect(vm, Identifier::fromString(vm, name), jsBoolean(enabled));
+    };
+    setFlag("inspector"_s, true);
+#ifdef BUN_DEBUG
+    setFlag("debug"_s, true);
+#else
+    setFlag("debug"_s, false);
+#endif
+    setFlag("uv"_s, true); // lying
+    setFlag("ipv6"_s, true);
+    setFlag("tls_alpn"_s, true);
+    setFlag("tls_sni"_s, true);
+    setFlag("tls_ocsp"_s, true);
+    setFlag("tls"_s, true);
+    setFlag("cached_builtins"_s, true);
+    return features;
+}
+
+static int _debugPort;
+
+JSC_DEFINE_CUSTOM_GETTER(processDebugPort, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    if (_debugPort == 0) {
+        _debugPort = 9229;
     }
-    auto scope = DECLARE_THROW_SCOPE(vm);
 
-    JSC::JSObject* object = JSC::constructEmptyObject(globalObject, globalObject->objectPrototype(), 19);
+    return JSC::JSValue::encode(jsNumber(_debugPort));
+}
 
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "node"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(REPORTED_NODE_VERSION))));
-    object->putDirect(
-        vm, JSC::Identifier::fromString(vm, "bun"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(Bun__version + 1 /* prefix with v */))));
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(BUN_WEBKIT_VERSION))));
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "boringssl"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_boringssl))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "libarchive"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_libarchive))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "mimalloc"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_mimalloc))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "picohttpparser"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_picohttpparser))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "uwebsockets"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_uws))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_webkit))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "zig"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zig))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "zlib"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zlib))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "tinycc"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_tinycc))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "lolhtml"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_lolhtml))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "ares"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_c_ares))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "usockets"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_usockets))), 0);
+JSC_DEFINE_CUSTOM_SETTER(setProcessDebugPort,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue encodedValue, JSC::PropertyName))
+{
+    auto& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSValue value = JSValue::decode(encodedValue);
 
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "v8"_s), JSValue(JSC::jsString(vm, makeString("10.8.168.20-node.8"_s))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "uv"_s), JSValue(JSC::jsString(vm, makeString("1.44.2"_s))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "napi"_s), JSValue(JSC::jsString(vm, makeString("8"_s))), 0);
+    if (!value.isInt32()) {
+        throwRangeError(globalObject, scope, "debugPort must be 0 or in range 1024 to 65535"_s);
+        return false;
+    }
 
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "modules"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString("108"))));
+    int port = value.asInt32();
 
-    thisObject->putDirect(vm, clientData->builtinNames().versionsPublicName(), object, 0);
+    if (port != 0) {
+        if (port < 1024 || port > 65535) {
+            throwRangeError(globalObject, scope, "debugPort must be 0 or in range 1024 to 65535"_s);
+            return false;
+        }
+    }
 
-    RETURN_IF_EXCEPTION(scope, {});
+    _debugPort = port;
+    return true;
+}
 
-    return JSValue::encode(object);
+JSC_DEFINE_CUSTOM_GETTER(processTitle, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    ZigString str;
+    Bun__Process__getTitle(globalObject, &str);
+    return JSValue::encode(Zig::toJSStringValue(str, globalObject));
 }
-JSC_DEFINE_CUSTOM_SETTER(Process_setVersionsLazy,
+
+JSC_DEFINE_CUSTOM_SETTER(setProcessTitle,
     (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
         JSC::EncodedJSValue value, JSC::PropertyName))
 {
     JSC::VM& vm = globalObject->vm();
-    auto clientData = WebCore::clientData(vm);
 
-    Zig::Process* thisObject = JSC::jsDynamicCast<Zig::Process*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return JSValue::encode(JSC::jsUndefined());
+    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
+    JSC::JSString* jsString = JSC::jsDynamicCast<JSC::JSString*>(JSValue::decode(value));
+    if (!thisObject || !jsString) {
+        return false;
     }
 
-    thisObject->putDirect(vm, clientData->builtinNames().versionsPublicName(),
-        JSC::JSValue::decode(value), 0);
+    ZigString str = Zig::toZigString(jsString, globalObject);
+    Bun__Process__setTitle(globalObject, &str);
 
     return true;
 }
 
-static JSC_DEFINE_HOST_FUNCTION(Process_functionCwd,
+JSC_DEFINE_HOST_FUNCTION(Process_functionCwd,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
@@ -958,4 +1606,162 @@ static JSC_DEFINE_HOST_FUNCTION(Process_functionCwd,
     return JSC::JSValue::encode(result);
 }
 
+JSC_DEFINE_HOST_FUNCTION(Process_functionReallyKill,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+    // Backs process._kill(pid, signal): both arguments are coerced with toInt32 and handed to kill() unchanged.
+    int pid = callFrame->argument(0).toInt32(globalObject);
+    RETURN_IF_EXCEPTION(scope, {});
+
+    int signal = callFrame->argument(1).toInt32(globalObject);
+    RETURN_IF_EXCEPTION(scope, {});
+    // A -1 result is turned into a thrown SystemError carrying errno and the syscall name "kill".
+    int result = kill(pid, signal);
+    if (result == -1) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("kill"_s);
+        throwException(globalObject, scope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
+
+    return JSValue::encode(jsUndefined());
+}
+// process.kill(pid[, signal]): sends `signal` (SIGTERM when undefined/null) to `pid` via kill().
+// `signal` may be a number or a signal name looked up in signalNameToNumberMap; returns undefined.
+JSC_DEFINE_HOST_FUNCTION(Process_functionKill,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+
+    int pid = callFrame->argument(0).toInt32(globalObject);
+    RETURN_IF_EXCEPTION(scope, {});
+    if (pid < 0) {
+        throwRangeError(globalObject, scope, "pid must be a positive integer"_s);
+        return JSValue::encode(jsUndefined());
+    }
+
+    JSC::JSValue signalValue = callFrame->argument(1);
+    int signal = SIGTERM;
+
+    if (signalValue.isNumber()) {
+        signal = signalValue.toInt32(globalObject);
+        RETURN_IF_EXCEPTION(scope, {});
+    } else if (signalValue.isString()) {
+        loadSignalNumberMap();
+        // toWTFString can throw; check before the lookup so a second error is never thrown on top of it.
+        auto signalName = signalValue.toWTFString(globalObject);
+        RETURN_IF_EXCEPTION(scope, {});
+        if (auto num = signalNameToNumberMap->get(signalName)) {
+            signal = num;
+        } else {
+            throwRangeError(globalObject, scope, "Unknown signal name"_s);
+            return JSValue::encode(jsUndefined());
+        }
+    } else if (!signalValue.isUndefinedOrNull()) {
+        throwTypeError(globalObject, scope, "signal must be a string or number"_s);
+        return JSValue::encode(jsUndefined());
+    }
+
+    int result = kill(pid, signal);
+    if (result == -1) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("kill"_s);
+        throwException(globalObject, scope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
+
+    return JSValue::encode(jsUndefined());
+}
+
+/* Source for Process.lut.h
+@begin processObjectTable
+  abort                            Process_functionAbort                    Function 1
+  allowedNodeEnvironmentFlags      Process_stubEmptySet                     PropertyCallback
+  arch                             constructArch                            PropertyCallback
+  argv                             constructArgv                            PropertyCallback
+  argv0                            constructArgv0                           PropertyCallback
+  assert                           Process_functionAssert                   Function 1
+  binding                          JSBuiltin                                Function 1
+  browser                          constructBrowser                         PropertyCallback
+  chdir                            Process_functionChdir                    Function 1
+  config                           constructProcessConfigObject             PropertyCallback
+  cpuUsage                         Process_functionCpuUsage                 Function 1
+  cwd                              Process_functionCwd                      Function 1
+  debugPort                        processDebugPort                         CustomAccessor
+  dlopen                           Process_functionDlopen                   Function 1
+  emitWarning                      Process_emitWarning                      Function 1
+  env                              constructEnv                             PropertyCallback
+  execArgv                         constructExecArgv                        PropertyCallback
+  execPath                         constructExecPath                        PropertyCallback
+  exit                             Process_functionExit                     Function 1
+  exitCode                         processExitCode                          CustomAccessor
+  features                         constructFeatures                        PropertyCallback
+  getActiveResourcesInfo           Process_stubFunctionReturningArray       Function 0
+  getegid                          Process_functiongetegid                  Function 0
+  geteuid                          Process_functiongeteuid                  Function 0
+  getgid                           Process_functiongetgid                   Function 0
+  getgroups                        Process_functiongetgroups                Function 0
+  getuid                           Process_functiongetuid                   Function 0
+  hrtime                           constructProcessHrtimeObject             PropertyCallback
+  isBun                            constructIsBun                           PropertyCallback
+  kill                             Process_functionKill                     Function 2
+  mainModule                       JSBuiltin                                ReadOnly|Builtin|Accessor|Function 0
+  memoryUsage                      constructMemoryUsage                     PropertyCallback
+  moduleLoadList                   Process_stubEmptyArray                   PropertyCallback
+  nextTick                         Process_functionNextTick                 Function 1
+  openStdin                        Process_functionOpenStdin                Function 0
+  pid                              constructPid                             PropertyCallback
+  platform                         constructPlatform                        PropertyCallback
+  ppid                             constructPpid                            PropertyCallback
+  reallyExit                       Process_functionReallyExit               Function 1
+  release                          constructProcessReleaseObject            PropertyCallback
+  revision                         constructRevision                        PropertyCallback
+  setSourceMapsEnabled             Process_stubEmptyFunction                Function 1
+  stderr                           constructStderr                          PropertyCallback
+  stdin                            constructStdin                           PropertyCallback
+  stdout                           constructStdout                          PropertyCallback
+  title                            processTitle                             CustomAccessor
+  umask                            Process_functionUmask                    Function 1
+  uptime                           Process_functionUptime                   Function 1
+  version                          constructVersion                         PropertyCallback
+  versions                         constructVersions                        PropertyCallback
+  _debugEnd                        Process_stubEmptyFunction                Function 0
+  _debugProcess                    Process_stubEmptyFunction                Function 0
+  _fatalException                  Process_stubEmptyFunction                Function 1
+  _getActiveRequests               Process_stubFunctionReturningArray       Function 0
+  _getActiveHandles                Process_stubFunctionReturningArray       Function 0
+  _linkedBinding                   Process_stubEmptyFunction                Function 0
+  _preload_modules                 Process_stubEmptyObject                  PropertyCallback
+  _rawDebug                        Process_stubEmptyFunction                Function 0
+  _startProfilerIdleNotifier       Process_stubEmptyFunction                Function 0
+  _stopProfilerIdleNotifier        Process_stubEmptyFunction                Function 0
+  _tickCallback                    Process_stubEmptyFunction                Function 0
+  _kill                            Process_functionReallyKill               Function 2
+@end
+*/
+
+#include "Process.lut.h"
+const JSC::ClassInfo Process::s_info = { "Process"_s, &Base::s_info, &processObjectTable, nullptr,
+    CREATE_METHOD_TABLE(Process) };
+
+void Process::finishCreation(JSC::VM& vm)
+{
+    Base::finishCreation(vm);
+    // Hook the wrapped object's listener-change callback up to onDidChangeListeners.
+    this->wrapped().onDidChangeListener = &onDidChangeListeners;
+    // The two structures below are registered through initLater (presumably built on first access - confirm).
+    this->cpuUsageStructure.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::Structure>::Initializer& init) {
+        init.set(constructCPUUsageStructure(init.vm, init.owner->globalObject()));
+    });
+
+    this->memoryUsageStructure.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::Structure>::Initializer& init) {
+        init.set(constructMemoryUsageStructure(init.vm, init.owner->globalObject()));
+    });
+    // Store the string "process" under the toStringTag symbol with attribute flags 0.
+    this->putDirect(vm, vm.propertyNames->toStringTagSymbol, jsString(vm, String("process"_s)), 0);
+}
+
 } // namespace Zig
diff --git a/src/bun.js/bindings/Process.h b/src/bun.js/bindings/Process.h
index 322b39078..0ee6f4243 100644
--- a/src/bun.js/bindings/Process.h
+++ b/src/bun.js/bindings/Process.h
@@ -19,6 +19,8 @@ public:
     {
     }
 
+    void emitSignalEvent(int signalNumber);
+
     DECLARE_EXPORT_INFO;
 
     static void destroy(JSC::JSCell* cell)
@@ -28,7 +30,7 @@ public:
 
     ~Process();
 
-    static constexpr unsigned StructureFlags = Base::StructureFlags;
+    static constexpr unsigned StructureFlags = Base::StructureFlags | HasStaticPropertyTable;
 
     static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject,
         JSC::JSValue prototype)
@@ -45,6 +47,24 @@ public:
         return accessor;
     }
 
+    LazyProperty<JSObject, Structure> cpuUsageStructure;
+    LazyProperty<JSObject, Structure> memoryUsageStructure;
+
+    DECLARE_VISIT_CHILDREN;
+
+    template<typename, SubspaceAccess mode>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+            return nullptr; // no subspace is handed out for concurrent access
+        return WebCore::subspaceForImpl<Process, WebCore::UseCustomHeapCellType::No>(
+            vm,
+            [](auto& spaces) { return spaces.m_clientSubspaceForProcessObject.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForProcessObject = std::forward<decltype(space)>(space); },
+            [](auto& spaces) { return spaces.m_subspaceForProcessObject.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_subspaceForProcessObject = std::forward<decltype(space)>(space); });
+    }
+
     void finishCreation(JSC::VM& vm);
 };
 
diff --git a/src/bun.js/bindings/Process.lut.h b/src/bun.js/bindings/Process.lut.h
new file mode 100644
index 000000000..81cf98c7d
--- /dev/null
+++ b/src/bun.js/bindings/Process.lut.h
@@ -0,0 +1,214 @@
+// File generated via `make generate-builtins`
+static const struct CompactHashIndex processObjectTableIndex[143] = {
+    { 44, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 15, 129 },
+    { -1, -1 },
+    { -1, -1 },
+    { 18, 139 },
+    { -1, -1 },
+    { 46, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 3, 142 },
+    { 1, 128 },
+    { -1, -1 },
+    { 60, -1 },
+    { -1, -1 },
+    { 10, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 32, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 53, -1 },
+    { 27, -1 },
+    { 12, -1 },
+    { -1, -1 },
+    { 19, -1 },
+    { -1, -1 },
+    { 14, 138 },
+    { -1, -1 },
+    { 37, -1 },
+    { -1, -1 },
+    { 39, -1 },
+    { 56, -1 },
+    { 36, -1 },
+    { 6, 140 },
+    { -1, -1 },
+    { 52, -1 },
+    { 4, -1 },
+    { 48, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 2, -1 },
+    { 7, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 41, -1 },
+    { -1, -1 },
+    { 29, 133 },
+    { -1, -1 },
+    { 0, -1 },
+    { 26, 136 },
+    { 16, 130 },
+    { 40, -1 },
+    { -1, -1 },
+    { 23, -1 },
+    { 11, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 59, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 31, 137 },
+    { -1, -1 },
+    { 30, -1 },
+    { 22, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 24, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 20, -1 },
+    { -1, -1 },
+    { 5, -1 },
+    { -1, -1 },
+    { 61, -1 },
+    { 49, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 13, 131 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 9, -1 },
+    { 25, 134 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 21, 135 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 47, 141 },
+    { -1, -1 },
+    { 17, -1 },
+    { 8, -1 },
+    { 28, -1 },
+    { 33, 132 },
+    { 34, -1 },
+    { 35, -1 },
+    { 38, -1 },
+    { 42, -1 },
+    { 43, -1 },
+    { 45, -1 },
+    { 50, -1 },
+    { 51, -1 },
+    { 54, -1 },
+    { 55, -1 },
+    { 57, -1 },
+    { 58, -1 },
+};
+
+static const struct HashTableValue processObjectTableValues[62] = {
+   { "abort"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionAbort, 1 } },
+   { "allowedNodeEnvironmentFlags"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, Process_stubEmptySet } },
+   { "arch"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructArch } },
+   { "argv"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructArgv } },
+   { "argv0"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructArgv0 } },
+   { "assert"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionAssert, 1 } },
+   { "binding"_s, ((static_cast<unsigned>(PropertyAttribute::Function)) & ~PropertyAttribute::Function) | PropertyAttribute::Builtin, NoIntrinsic, { HashTableValue::BuiltinGeneratorType, processObjectBindingCodeGenerator, 1 } },
+   { "browser"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructBrowser } },
+   { "chdir"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionChdir, 1 } },
+   { "config"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructProcessConfigObject } },
+   { "cpuUsage"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionCpuUsage, 1 } },
+   { "cwd"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionCwd, 1 } },
+   { "debugPort"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, processDebugPort, setProcessDebugPort } },
+   { "dlopen"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionDlopen, 1 } },
+   { "emitWarning"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_emitWarning, 1 } },
+   { "env"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructEnv } },
+   { "execArgv"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructExecArgv } },
+   { "execPath"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructExecPath } },
+   { "exit"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionExit, 1 } },
+   { "exitCode"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, processExitCode, setProcessExitCode } },
+   { "features"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructFeatures } },
+   { "getActiveResourcesInfo"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubFunctionReturningArray, 0 } },
+   { "getegid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetegid, 0 } },
+   { "geteuid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongeteuid, 0 } },
+   { "getgid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetgid, 0 } },
+   { "getgroups"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetgroups, 0 } },
+   { "getuid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetuid, 0 } },
+   { "hrtime"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructProcessHrtimeObject } },
+   { "isBun"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructIsBun } },
+   { "kill"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionKill, 2 } },
+   { "mainModule"_s, ((static_cast<unsigned>(PropertyAttribute::ReadOnly|PropertyAttribute::Builtin|PropertyAttribute::Accessor|PropertyAttribute::Function)) & ~PropertyAttribute::Function) | PropertyAttribute::Builtin, NoIntrinsic, { HashTableValue::BuiltinGeneratorType, processObjectMainModuleCodeGenerator, 0 } },
+   { "memoryUsage"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructMemoryUsage } },
+   { "moduleLoadList"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, Process_stubEmptyArray } },
+   { "nextTick"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionNextTick, 1 } },
+   { "openStdin"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionOpenStdin, 0 } },
+   { "pid"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructPid } },
+   { "platform"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructPlatform } },
+   { "ppid"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructPpid } },
+   { "reallyExit"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionReallyExit, 1 } },
+   { "release"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructProcessReleaseObject } },
+   { "revision"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructRevision } },
+   { "setSourceMapsEnabled"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 1 } },
+   { "stderr"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructStderr } },
+   { "stdin"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructStdin } },
+   { "stdout"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructStdout } },
+   { "title"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, processTitle, setProcessTitle } },
+   { "umask"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionUmask, 1 } },
+   { "uptime"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionUptime, 1 } },
+   { "version"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructVersion } },
+   { "versions"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructVersions } },
+   { "_debugEnd"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_debugProcess"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_fatalException"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 1 } },
+   { "_getActiveRequests"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubFunctionReturningArray, 0 } },
+   { "_getActiveHandles"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubFunctionReturningArray, 0 } },
+   { "_linkedBinding"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_preload_modules"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, Process_stubEmptyObject } },
+   { "_rawDebug"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_startProfilerIdleNotifier"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_stopProfilerIdleNotifier"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_tickCallback"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_kill"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionReallyKill, 2 } },
+};
+
+static const struct HashTable processObjectTable =
+    { 62, 127, true, nullptr, processObjectTableValues, processObjectTableIndex };
diff --git a/src/bun.js/bindings/ScriptExecutionContext.cpp b/src/bun.js/bindings/ScriptExecutionContext.cpp
index e8cae5e33..3262bdb5d 100644
--- a/src/bun.js/bindings/ScriptExecutionContext.cpp
+++ b/src/bun.js/bindings/ScriptExecutionContext.cpp
@@ -20,6 +20,12 @@ static HashMap<ScriptExecutionContextIdentifier, ScriptExecutionContext*>& allSc
     return contexts;
 }
 
+ScriptExecutionContext* ScriptExecutionContext::getScriptExecutionContext(ScriptExecutionContextIdentifier identifier)
+{
+    Locker locker { allScriptExecutionContextsMapLock }; // hold the map lock for the duration of the lookup
+    return allScriptExecutionContextsMap().get(identifier);
+}
+
 template<bool SSL, bool isServer>
 static void registerHTTPContextForWebSocket(ScriptExecutionContext* script, us_socket_context_t* ctx, us_loop_t* loop)
 {
diff --git a/src/bun.js/bindings/ScriptExecutionContext.h b/src/bun.js/bindings/ScriptExecutionContext.h
index 5f6c56a90..aed7977a5 100644
--- a/src/bun.js/bindings/ScriptExecutionContext.h
+++ b/src/bun.js/bindings/ScriptExecutionContext.h
@@ -96,7 +96,12 @@ public:
         }
     }
 
-    const WTF::URL& url() const { return m_url; }
+    static ScriptExecutionContext* getScriptExecutionContext(ScriptExecutionContextIdentifier identifier);
+
+    const WTF::URL& url() const
+    {
+        return m_url;
+    }
     bool activeDOMObjectsAreSuspended() { return false; }
     bool activeDOMObjectsAreStopped() { return false; }
     bool isContextThread() { return true; }
@@ -141,6 +146,7 @@ public:
         auto* task = new EventLoopTask(WTFMove(lambda));
         postTaskOnTimeout(task, timeout);
     }
+
     template<typename... Arguments>
     void postCrossThreadTask(Arguments&&... arguments)
     {
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h b/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h
index b16febcdb..f0d491c0b 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h
@@ -8,6 +8,7 @@ std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectConstructor;std:
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectAnything;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectStringContaining;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectStringMatching;
+std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForFSWatcher;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForFileSystemRouter;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForFileSystemRouterConstructor;std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForListener;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForMD4;
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h b/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h
index 59263e62c..02a9adbca 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h
@@ -8,6 +8,7 @@ std::unique_ptr<IsoSubspace> m_subspaceForExpectConstructor;std::unique_ptr<IsoS
 std::unique_ptr<IsoSubspace> m_subspaceForExpectAnything;
 std::unique_ptr<IsoSubspace> m_subspaceForExpectStringContaining;
 std::unique_ptr<IsoSubspace> m_subspaceForExpectStringMatching;
+std::unique_ptr<IsoSubspace> m_subspaceForFSWatcher;
 std::unique_ptr<IsoSubspace> m_subspaceForFileSystemRouter;
 std::unique_ptr<IsoSubspace> m_subspaceForFileSystemRouterConstructor;std::unique_ptr<IsoSubspace> m_subspaceForListener;
 std::unique_ptr<IsoSubspace> m_subspaceForMD4;
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h
index 4471fbab3..ac03032e6 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h
@@ -58,6 +58,12 @@ JSC::Structure* JSExpectStringMatchingStructure() { return m_JSExpectStringMatch
   JSC::LazyClassStructure m_JSExpectStringMatching;
   bool hasJSExpectStringMatchingSetterValue { false };
   mutable JSC::WriteBarrier<JSC::Unknown> m_JSExpectStringMatchingSetterValue;
+JSC::Structure* JSFSWatcherStructure() { return m_JSFSWatcher.getInitializedOnMainThread(this); }
+        JSC::JSObject* JSFSWatcherConstructor() { return m_JSFSWatcher.constructorInitializedOnMainThread(this); }
+        JSC::JSValue JSFSWatcherPrototype() { return m_JSFSWatcher.prototypeInitializedOnMainThread(this); }
+  JSC::LazyClassStructure m_JSFSWatcher;
+  bool hasJSFSWatcherSetterValue { false };
+  mutable JSC::WriteBarrier<JSC::Unknown> m_JSFSWatcherSetterValue;
 JSC::Structure* JSFileSystemRouterStructure() { return m_JSFileSystemRouter.getInitializedOnMainThread(this); }
         JSC::JSObject* JSFileSystemRouterConstructor() { return m_JSFileSystemRouter.constructorInitializedOnMainThread(this); }
         JSC::JSValue JSFileSystemRouterPrototype() { return m_JSFileSystemRouter.prototypeInitializedOnMainThread(this); }
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h
index 4e5a2c1fa..b3b5327a4 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h
@@ -59,6 +59,12 @@ void GlobalObject::initGeneratedLazyClasses() {
                  init.setStructure(WebCore::JSExpectStringMatching::createStructure(init.vm, init.global, init.prototype));
                  
               });
+    m_JSFSWatcher.initLater(
+              [](LazyClassStructure::Initializer& init) {
+                 init.setPrototype(WebCore::JSFSWatcher::createPrototype(init.vm, reinterpret_cast<Zig::GlobalObject*>(init.global)));
+                 init.setStructure(WebCore::JSFSWatcher::createStructure(init.vm, init.global, init.prototype));
+                 
+              });
     m_JSFileSystemRouter.initLater(
               [](LazyClassStructure::Initializer& init) {
                  init.setPrototype(WebCore::JSFileSystemRouter::createPrototype(init.vm, reinterpret_cast<Zig::GlobalObject*>(init.global)));
@@ -211,6 +217,7 @@ void GlobalObject::visitGeneratedLazyClasses(GlobalObject *thisObject, Visitor&
       thisObject->m_JSExpectAnything.visit(visitor);  visitor.append(thisObject->m_JSExpectAnythingSetterValue);
       thisObject->m_JSExpectStringContaining.visit(visitor);  visitor.append(thisObject->m_JSExpectStringContainingSetterValue);
       thisObject->m_JSExpectStringMatching.visit(visitor);  visitor.append(thisObject->m_JSExpectStringMatchingSetterValue);
+      thisObject->m_JSFSWatcher.visit(visitor);  visitor.append(thisObject->m_JSFSWatcherSetterValue);
       thisObject->m_JSFileSystemRouter.visit(visitor);  visitor.append(thisObject->m_JSFileSystemRouterSetterValue);
       thisObject->m_JSListener.visit(visitor);  visitor.append(thisObject->m_JSListenerSetterValue);
       thisObject->m_JSMD4.visit(visitor);  visitor.append(thisObject->m_JSMD4SetterValue);
diff --git a/src/bun.js/bindings/ZigGeneratedClasses.cpp b/src/bun.js/bindings/ZigGeneratedClasses.cpp
index d51a1959a..b4d672328 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses.cpp
+++ b/src/bun.js/bindings/ZigGeneratedClasses.cpp
@@ -103,6 +103,9 @@ extern "C" void BlobClass__finalize(void*);
 extern "C" EncodedJSValue BlobPrototype__getArrayBuffer(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(BlobPrototype__arrayBufferCallback);
 
+extern "C" EncodedJSValue BlobPrototype__getExists(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(BlobPrototype__existsCallback);
+
 extern "C" EncodedJSValue BlobPrototype__getFormData(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(BlobPrototype__formDataCallback);
 
@@ -137,6 +140,7 @@ STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSBlobPrototype, JSBlobPrototype::Base);
 
 static const HashTableValue JSBlobPrototypeTableValues[] = {
     { "arrayBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__arrayBufferCallback, 0 } },
+    { "exists"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__existsCallback, 0 } },
     { "formData"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__formDataCallback, 0 } },
     { "json"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__jsonCallback, 0 } },
     { "lastModified"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, BlobPrototype__lastModifiedGetterWrap, 0 } },
@@ -190,6 +194,33 @@ JSC_DEFINE_HOST_FUNCTION(BlobPrototype__arrayBufferCallback, (JSGlobalObject * l
     return BlobPrototype__getArrayBuffer(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(BlobPrototype__existsCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSBlob* thisObject = jsDynamicCast<JSBlob*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return BlobPrototype__getExists(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(BlobPrototype__formDataCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -5381,6 +5412,307 @@ void JSExpectStringMatching::visitOutputConstraintsImpl(JSCell* cell, Visitor& v
 }
 
 DEFINE_VISIT_OUTPUT_CONSTRAINTS(JSExpectStringMatching);
+class JSFSWatcherPrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+
+    static JSFSWatcherPrototype* create(JSC::VM& vm, JSGlobalObject* globalObject, JSC::Structure* structure)
+    {
+        JSFSWatcherPrototype* ptr = new (NotNull, JSC::allocateCell<JSFSWatcherPrototype>(vm)) JSFSWatcherPrototype(vm, globalObject, structure);
+        ptr->finishCreation(vm, globalObject);
+        return ptr;
+    }
+
+    DECLARE_INFO;
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        return &vm.plainObjectSpace();
+    }
+    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+    {
+        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
+    }
+
+private:
+    JSFSWatcherPrototype(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+        : Base(vm, structure)
+    {
+    }
+
+    void finishCreation(JSC::VM&, JSC::JSGlobalObject*);
+};
+
+extern "C" void FSWatcherClass__finalize(void*);
+
+extern "C" EncodedJSValue FSWatcherPrototype__doClose(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__closeCallback);
+
+extern "C" EncodedJSValue FSWatcherPrototype__hasRef(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__hasRefCallback);
+
+extern "C" EncodedJSValue FSWatcherPrototype__doRef(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__refCallback);
+
+extern "C" EncodedJSValue FSWatcherPrototype__doUnref(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__unrefCallback);
+
+STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSFSWatcherPrototype, JSFSWatcherPrototype::Base);
+
+static const HashTableValue JSFSWatcherPrototypeTableValues[] = {
+    { "close"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__closeCallback, 0 } },
+    { "hasRef"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__hasRefCallback, 0 } },
+    { "ref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__refCallback, 0 } },
+    { "unref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__unrefCallback, 0 } }
+};
+
+const ClassInfo JSFSWatcherPrototype::s_info = { "FSWatcher"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSFSWatcherPrototype) };
+
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__closeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return FSWatcherPrototype__doClose(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__hasRefCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return FSWatcherPrototype__hasRef(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__refCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return FSWatcherPrototype__doRef(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__unrefCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return FSWatcherPrototype__doUnref(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+extern "C" void FSWatcherPrototype__listenerSetCachedValue(JSC::EncodedJSValue thisValue, JSC::JSGlobalObject* globalObject, JSC::EncodedJSValue value)
+{
+    auto& vm = globalObject->vm();
+    auto* thisObject = jsCast<JSFSWatcher*>(JSValue::decode(thisValue));
+    thisObject->m_listener.set(vm, thisObject, JSValue::decode(value));
+}
+
+extern "C" EncodedJSValue FSWatcherPrototype__listenerGetCachedValue(JSC::EncodedJSValue thisValue)
+{
+    auto* thisObject = jsCast<JSFSWatcher*>(JSValue::decode(thisValue));
+    return JSValue::encode(thisObject->m_listener.get());
+}
+
+void JSFSWatcherPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
+{
+    Base::finishCreation(vm);
+    reifyStaticProperties(vm, JSFSWatcher::info(), JSFSWatcherPrototypeTableValues, *this);
+    JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+}
+
+extern "C" bool FSWatcher__hasPendingActivity(void* ptr);
+bool JSFSWatcher::hasPendingActivity(void* ctx)
+{
+    return FSWatcher__hasPendingActivity(ctx);
+}
+
+JSFSWatcher::~JSFSWatcher()
+{
+    if (m_ctx) {
+        FSWatcherClass__finalize(m_ctx);
+    }
+}
+void JSFSWatcher::destroy(JSCell* cell)
+{
+    static_cast<JSFSWatcher*>(cell)->JSFSWatcher::~JSFSWatcher();
+}
+
+const ClassInfo JSFSWatcher::s_info = { "FSWatcher"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSFSWatcher) };
+
+void JSFSWatcher::finishCreation(VM& vm)
+{
+    Base::finishCreation(vm);
+    ASSERT(inherits(info()));
+}
+
+JSFSWatcher* JSFSWatcher::create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, void* ctx)
+{
+    JSFSWatcher* ptr = new (NotNull, JSC::allocateCell<JSFSWatcher>(vm)) JSFSWatcher(vm, structure, ctx);
+    ptr->finishCreation(vm);
+    return ptr;
+}
+
+extern "C" void* FSWatcher__fromJS(JSC::EncodedJSValue value)
+{
+    JSC::JSValue decodedValue = JSC::JSValue::decode(value);
+    if (decodedValue.isEmpty() || !decodedValue.isCell())
+        return nullptr;
+
+    JSC::JSCell* cell = decodedValue.asCell();
+    JSFSWatcher* object = JSC::jsDynamicCast<JSFSWatcher*>(cell);
+
+    if (!object)
+        return nullptr;
+
+    return object->wrapped();
+}
+
+extern "C" bool FSWatcher__dangerouslySetPtr(JSC::EncodedJSValue value, void* ptr)
+{
+    JSFSWatcher* object = JSC::jsDynamicCast<JSFSWatcher*>(JSValue::decode(value));
+    if (!object)
+        return false;
+
+    object->m_ctx = ptr;
+    return true;
+}
+
+extern "C" const size_t FSWatcher__ptrOffset = JSFSWatcher::offsetOfWrapped();
+
+void JSFSWatcher::analyzeHeap(JSCell* cell, HeapAnalyzer& analyzer)
+{
+    auto* thisObject = jsCast<JSFSWatcher*>(cell);
+    if (void* wrapped = thisObject->wrapped()) {
+        // if (thisObject->scriptExecutionContext())
+        //     analyzer.setLabelForCell(cell, "url " + thisObject->scriptExecutionContext()->url().string());
+    }
+    Base::analyzeHeap(cell, analyzer);
+}
+
+JSObject* JSFSWatcher::createPrototype(VM& vm, JSDOMGlobalObject* globalObject)
+{
+    return JSFSWatcherPrototype::create(vm, globalObject, JSFSWatcherPrototype::createStructure(vm, globalObject, globalObject->objectPrototype()));
+}
+
+extern "C" EncodedJSValue FSWatcher__create(Zig::GlobalObject* globalObject, void* ptr)
+{
+    auto& vm = globalObject->vm();
+    JSC::Structure* structure = globalObject->JSFSWatcherStructure();
+    JSFSWatcher* instance = JSFSWatcher::create(vm, globalObject, structure, ptr);
+
+    return JSValue::encode(instance);
+}
+
+template<typename Visitor>
+void JSFSWatcher::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{
+    JSFSWatcher* thisObject = jsCast<JSFSWatcher*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    Base::visitChildren(thisObject, visitor);
+    visitor.append(thisObject->m_listener);
+
+    visitor.addOpaqueRoot(thisObject->wrapped());
+}
+
+DEFINE_VISIT_CHILDREN(JSFSWatcher);
+
+template<typename Visitor>
+void JSFSWatcher::visitAdditionalChildren(Visitor& visitor)
+{
+    JSFSWatcher* thisObject = this;
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    visitor.append(thisObject->m_listener);
+
+    visitor.addOpaqueRoot(this->wrapped());
+}
+
+DEFINE_VISIT_ADDITIONAL_CHILDREN(JSFSWatcher);
+
+template<typename Visitor>
+void JSFSWatcher::visitOutputConstraintsImpl(JSCell* cell, Visitor& visitor)
+{
+    JSFSWatcher* thisObject = jsCast<JSFSWatcher*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    thisObject->visitAdditionalChildren<Visitor>(visitor);
+}
+
+DEFINE_VISIT_OUTPUT_CONSTRAINTS(JSFSWatcher);
 class JSFileSystemRouterPrototype final : public JSC::JSNonFinalObject {
 public:
     using Base = JSC::JSNonFinalObject;
@@ -7654,6 +7986,9 @@ JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__utimesCallback);
 extern "C" EncodedJSValue NodeJSFSPrototype__utimesSync(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__utimesSyncCallback);
 
+extern "C" EncodedJSValue NodeJSFSPrototype__watch(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__watchCallback);
+
 extern "C" EncodedJSValue NodeJSFSPrototype__write(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__writeCallback);
 
@@ -7751,6 +8086,7 @@ static const HashTableValue JSNodeJSFSPrototypeTableValues[] = {
     { "unlinkSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__unlinkSyncCallback, 1 } },
     { "utimes"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__utimesCallback, 4 } },
     { "utimesSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__utimesSyncCallback, 3 } },
+    { "watch"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__watchCallback, 3 } },
     { "write"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__writeCallback, 6 } },
     { "writeFile"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__writeFileCallback, 4 } },
     { "writeFileSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__writeFileSyncCallback, 3 } },
@@ -9795,6 +10131,33 @@ JSC_DEFINE_HOST_FUNCTION(NodeJSFSPrototype__utimesSyncCallback, (JSGlobalObject
     return NodeJSFSPrototype__utimesSync(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(NodeJSFSPrototype__watchCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSNodeJSFS* thisObject = jsDynamicCast<JSNodeJSFS*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return NodeJSFSPrototype__watch(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(NodeJSFSPrototype__writeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -16509,6 +16872,9 @@ extern "C" void* TCPSocketClass__construct(JSC::JSGlobalObject*, JSC::CallFrame*
 JSC_DECLARE_CUSTOM_GETTER(jsTCPSocketConstructor);
 extern "C" void TCPSocketClass__finalize(void*);
 
+extern "C" JSC::EncodedJSValue TCPSocketPrototype__getALPNProtocol(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
+JSC_DECLARE_CUSTOM_GETTER(TCPSocketPrototype__alpnProtocolGetterWrap);
+
 extern "C" JSC::EncodedJSValue TCPSocketPrototype__getAuthorized(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TCPSocketPrototype__authorizedGetterWrap);
 
@@ -16545,6 +16911,9 @@ JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__reloadCallback);
 extern "C" JSC::EncodedJSValue TCPSocketPrototype__getRemoteAddress(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TCPSocketPrototype__remoteAddressGetterWrap);
 
+extern "C" EncodedJSValue TCPSocketPrototype__setServername(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__setServernameCallback);
+
 extern "C" EncodedJSValue TCPSocketPrototype__shutdown(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__shutdownCallback);
 
@@ -16554,12 +16923,16 @@ JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__timeoutCallback);
 extern "C" EncodedJSValue TCPSocketPrototype__unref(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__unrefCallback);
 
+extern "C" EncodedJSValue TCPSocketPrototype__upgradeTLS(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__upgradeTLSCallback);
+
 extern "C" EncodedJSValue TCPSocketPrototype__write(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__writeCallback);
 
 STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTCPSocketPrototype, JSTCPSocketPrototype::Base);
 
 static const HashTableValue JSTCPSocketPrototypeTableValues[] = {
+    { "alpnProtocol"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__alpnProtocolGetterWrap, 0 } },
     { "authorized"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__authorizedGetterWrap, 0 } },
     { "data"_s, static_cast<unsigned>(JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__dataGetterWrap, TCPSocketPrototype__dataSetterWrap } },
     { "end"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__endCallback, 3 } },
@@ -16571,9 +16944,11 @@ static const HashTableValue JSTCPSocketPrototypeTableValues[] = {
     { "ref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__refCallback, 0 } },
     { "reload"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__reloadCallback, 1 } },
     { "remoteAddress"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__remoteAddressGetterWrap, 0 } },
+    { "setServername"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__setServernameCallback, 1 } },
     { "shutdown"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__shutdownCallback, 1 } },
     { "timeout"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__timeoutCallback, 1 } },
     { "unref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__unrefCallback, 0 } },
+    { "upgradeTLS"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__upgradeTLSCallback, 1 } },
     { "write"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__writeCallback, 3 } }
 };
 
@@ -16591,6 +16966,18 @@ JSC_DEFINE_CUSTOM_GETTER(jsTCPSocketConstructor, (JSGlobalObject * lexicalGlobal
     return JSValue::encode(globalObject->JSTCPSocketConstructor());
 }
 
+JSC_DEFINE_CUSTOM_GETTER(TCPSocketPrototype__alpnProtocolGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    JSTCPSocket* thisObject = jsCast<JSTCPSocket*>(JSValue::decode(thisValue));
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+    JSC::EncodedJSValue result = TCPSocketPrototype__getALPNProtocol(thisObject->wrapped(), globalObject);
+    RETURN_IF_EXCEPTION(throwScope, {});
+    RELEASE_AND_RETURN(throwScope, result);
+}
+
 JSC_DEFINE_CUSTOM_GETTER(TCPSocketPrototype__authorizedGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -16847,6 +17234,33 @@ extern "C" EncodedJSValue TCPSocketPrototype__remoteAddressGetCachedValue(JSC::E
     return JSValue::encode(thisObject->m_remoteAddress.get());
 }
 
+JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__setServernameCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSTCPSocket* thisObject = jsDynamicCast<JSTCPSocket*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TCPSocketPrototype__setServername(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__shutdownCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -16928,6 +17342,33 @@ JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__unrefCallback, (JSGlobalObject * le
     return TCPSocketPrototype__unref(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__upgradeTLSCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSTCPSocket* thisObject = jsDynamicCast<JSTCPSocket*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TCPSocketPrototype__upgradeTLS(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__writeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -17116,6 +17557,9 @@ extern "C" void* TLSSocketClass__construct(JSC::JSGlobalObject*, JSC::CallFrame*
 JSC_DECLARE_CUSTOM_GETTER(jsTLSSocketConstructor);
 extern "C" void TLSSocketClass__finalize(void*);
 
+extern "C" JSC::EncodedJSValue TLSSocketPrototype__getALPNProtocol(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
+JSC_DECLARE_CUSTOM_GETTER(TLSSocketPrototype__alpnProtocolGetterWrap);
+
 extern "C" JSC::EncodedJSValue TLSSocketPrototype__getAuthorized(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TLSSocketPrototype__authorizedGetterWrap);
 
@@ -17152,6 +17596,9 @@ JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__reloadCallback);
 extern "C" JSC::EncodedJSValue TLSSocketPrototype__getRemoteAddress(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TLSSocketPrototype__remoteAddressGetterWrap);
 
+extern "C" EncodedJSValue TLSSocketPrototype__setServername(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__setServernameCallback);
+
 extern "C" EncodedJSValue TLSSocketPrototype__shutdown(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__shutdownCallback);
 
@@ -17161,12 +17608,16 @@ JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__timeoutCallback);
 extern "C" EncodedJSValue TLSSocketPrototype__unref(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__unrefCallback);
 
+extern "C" EncodedJSValue TLSSocketPrototype__upgradeTLS(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__upgradeTLSCallback);
+
 extern "C" EncodedJSValue TLSSocketPrototype__write(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__writeCallback);
 
 STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTLSSocketPrototype, JSTLSSocketPrototype::Base);
 
 static const HashTableValue JSTLSSocketPrototypeTableValues[] = {
+    { "alpnProtocol"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__alpnProtocolGetterWrap, 0 } },
     { "authorized"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__authorizedGetterWrap, 0 } },
     { "data"_s, static_cast<unsigned>(JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__dataGetterWrap, TLSSocketPrototype__dataSetterWrap } },
     { "end"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__endCallback, 3 } },
@@ -17178,9 +17629,11 @@ static const HashTableValue JSTLSSocketPrototypeTableValues[] = {
     { "ref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__refCallback, 0 } },
     { "reload"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__reloadCallback, 1 } },
     { "remoteAddress"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__remoteAddressGetterWrap, 0 } },
+    { "setServername"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__setServernameCallback, 1 } },
     { "shutdown"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__shutdownCallback, 1 } },
     { "timeout"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__timeoutCallback, 1 } },
     { "unref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__unrefCallback, 0 } },
+    { "upgradeTLS"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__upgradeTLSCallback, 1 } },
     { "write"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__writeCallback, 3 } }
 };
 
@@ -17198,6 +17651,18 @@ JSC_DEFINE_CUSTOM_GETTER(jsTLSSocketConstructor, (JSGlobalObject * lexicalGlobal
     return JSValue::encode(globalObject->JSTLSSocketConstructor());
 }
 
+JSC_DEFINE_CUSTOM_GETTER(TLSSocketPrototype__alpnProtocolGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
+{
+    // Custom getter shim: hands the socket's wrapped native pointer to TLSSocketPrototype__getALPNProtocol.
+    auto throwScope = DECLARE_THROW_SCOPE(lexicalGlobalObject->vm());
+    auto* zigGlobalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto* socket = jsCast<JSTLSSocket*>(JSValue::decode(thisValue));
+    JSC::EnsureStillAliveScope keepSocketAlive = JSC::EnsureStillAliveScope(socket);
+    JSC::EncodedJSValue encodedProtocol = TLSSocketPrototype__getALPNProtocol(socket->wrapped(), zigGlobalObject);
+    RETURN_IF_EXCEPTION(throwScope, {});
+    RELEASE_AND_RETURN(throwScope, encodedProtocol);
+}
+
 JSC_DEFINE_CUSTOM_GETTER(TLSSocketPrototype__authorizedGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -17454,6 +17919,33 @@ extern "C" EncodedJSValue TLSSocketPrototype__remoteAddressGetCachedValue(JSC::E
     return JSValue::encode(thisObject->m_remoteAddress.get());
 }
 
+JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__setServernameCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    // Host-function shim: checks that `this` is a JSTLSSocket, then forwards to TLSSocketPrototype__setServername.
+    auto& vm = lexicalGlobalObject->vm();
+    JSTLSSocket* thisObject = jsDynamicCast<JSTLSSocket*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    auto fileNameUTF8 = sourceOrigin.string().utf8(); // own the buffer: .data() on the temporary would dangle
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TLSSocketPrototype__setServername(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__shutdownCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -17535,6 +18027,33 @@ JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__unrefCallback, (JSGlobalObject * le
     return TLSSocketPrototype__unref(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__upgradeTLSCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    // Host-function shim: checks that `this` is a JSTLSSocket, then forwards to TLSSocketPrototype__upgradeTLS.
+    auto& vm = lexicalGlobalObject->vm();
+    JSTLSSocket* thisObject = jsDynamicCast<JSTLSSocket*>(callFrame->thisValue());
+
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    auto fileNameUTF8 = sourceOrigin.string().utf8(); // own the buffer: .data() on the temporary would dangle
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TLSSocketPrototype__upgradeTLS(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__writeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
diff --git a/src/bun.js/bindings/ZigGeneratedClasses.h b/src/bun.js/bindings/ZigGeneratedClasses.h
index 668cd3f6b..1631f960e 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses.h
@@ -578,6 +578,89 @@ public:
     mutable JSC::WriteBarrier<JSC::Unknown> m_testValue;
 };
 
+// JS cell wrapping an opaque native context pointer (m_ctx). Its weak-handle Owner reports the cell as reachable
+// while hasPendingActivity(ctx) returns true, or when the visitor says the handle's context is an opaque root.
+class JSFSWatcher final : public JSC::JSDestructibleObject {
+public:
+    using Base = JSC::JSDestructibleObject;
+    static JSFSWatcher* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, void* ctx);
+
+    DECLARE_EXPORT_INFO;
+    template<typename, JSC::SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+            return nullptr;
+        return WebCore::subspaceForImpl<JSFSWatcher, WebCore::UseCustomHeapCellType::No>(
+            vm,
+            [](auto& spaces) { return spaces.m_clientSubspaceForFSWatcher.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForFSWatcher = std::forward<decltype(space)>(space); },
+            [](auto& spaces) { return spaces.m_subspaceForFSWatcher.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_subspaceForFSWatcher = std::forward<decltype(space)>(space); });
+    }
+
+    static void destroy(JSC::JSCell*);
+    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+    {
+        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(static_cast<JSC::JSType>(0b11101110), StructureFlags), info());
+    }
+
+    static JSObject* createPrototype(VM& vm, JSDOMGlobalObject* globalObject);
+
+    ~JSFSWatcher();
+
+    // Native context accessors: detach() drops the pointer, so wrapped() yields nullptr afterwards.
+    void* wrapped() const { return m_ctx; }
+    void detach() { m_ctx = nullptr; }
+
+    static void analyzeHeap(JSCell*, JSC::HeapAnalyzer&);
+    static ptrdiff_t offsetOfWrapped() { return OBJECT_OFFSETOF(JSFSWatcher, m_ctx); }
+
+    void* m_ctx { nullptr };
+
+    JSFSWatcher(JSC::VM& vm, JSC::Structure* structure, void* sinkPtr)
+        : Base(vm, structure)
+        , m_ctx(sinkPtr)
+    {
+        m_weakThis = JSC::Weak<JSFSWatcher>(this, getOwner());
+    }
+
+    void finishCreation(JSC::VM&);
+
+    // Weak self-handle, registered with getOwner() by the constructor.
+    JSC::Weak<JSFSWatcher> m_weakThis;
+
+    static bool hasPendingActivity(void* ctx);
+
+    // Weak-handle owner whose reachability answer is described in the class comment; finalize() is a no-op.
+    class Owner final : public JSC::WeakHandleOwner {
+    public:
+        bool isReachableFromOpaqueRoots(JSC::Handle<JSC::Unknown> handle, void* context, JSC::AbstractSlotVisitor& visitor, const char** reason) final
+        {
+            auto* watcher = JSC::jsCast<JSFSWatcher*>(handle.slot()->asCell());
+            if (JSFSWatcher::hasPendingActivity(watcher->wrapped())) {
+                if (UNLIKELY(reason))
+                    *reason = "has pending activity";
+                return true;
+            }
+
+            return visitor.containsOpaqueRoot(context);
+        }
+        void finalize(JSC::Handle<JSC::Unknown>, void* context) final {}
+    };
+
+    static JSC::WeakHandleOwner* getOwner()
+    {
+        static NeverDestroyed<Owner> m_owner;
+        return &m_owner.get();
+    }
+
+    DECLARE_VISIT_CHILDREN;
+    template<typename Visitor> void visitAdditionalChildren(Visitor&);
+    DECLARE_VISIT_OUTPUT_CONSTRAINTS;
+
+    mutable JSC::WriteBarrier<JSC::Unknown> m_listener;
+};
+
 class JSFileSystemRouter final : public JSC::JSDestructibleObject {
 public:
     using Base = JSC::JSDestructibleObject;
diff --git a/src/bun.js/bindings/ZigGlobalObject.cpp b/src/bun.js/bindings/ZigGlobalObject.cpp
index e49b94687..91d365af6 100644
--- a/src/bun.js/bindings/ZigGlobalObject.cpp
+++ b/src/bun.js/bindings/ZigGlobalObject.cpp
@@ -181,6 +181,8 @@ namespace JSCastingHelpers = JSC::JSCastingHelpers;
 #include "DOMWrapperWorld-class.h"
 #include "CommonJSModuleRecord.h"
 #include <wtf/RAMSize.h>
+#include <wtf/text/Base64.h>
+#include "simdutf.h"
 
 constexpr size_t DEFAULT_ERROR_STACK_TRACE_LIMIT = 10;
 
@@ -194,6 +196,24 @@ constexpr size_t DEFAULT_ERROR_STACK_TRACE_LIMIT = 10;
 // #include <iostream>
 static bool has_loaded_jsc = false;
 
+namespace WebCore {
+class Base64Utilities {
+public:
+    // Decodes base64 text: a null input gives a null String, a failed decode gives an InvalidCharacterError exception.
+    static ExceptionOr<String> atob(const String& encodedString)
+    {
+        if (encodedString.isNull())
+            return String();
+
+        auto bytes = base64Decode(encodedString, Base64DecodeMode::DefaultValidatePaddingAndIgnoreWhitespace);
+        if (bytes)
+            return String(bytes->data(), bytes->size());
+
+        return Exception { InvalidCharacterError };
+    }
+};
+} // namespace WebCore
+
 extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(const char* ptr, size_t length))
 {
     if (has_loaded_jsc)
@@ -219,7 +239,9 @@ extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(c
         JSC::Options::useJITCage() = false;
         JSC::Options::useShadowRealm() = true;
         JSC::Options::useResizableArrayBuffer() = true;
+#ifdef BUN_DEBUG
         JSC::Options::showPrivateScriptsInStackTraces() = true;
+#endif
         JSC::Options::useSetMethods() = true;
 
         /*
@@ -280,7 +302,13 @@ extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(c
         // crypto.createHash("sha1")    985.26 ns/iter    (956.7 ns … 1.12 µs)      1 µs   1.12 µs   1.12 µs
         // Peak memory usage: 56 MB
         size_t ramSize = WTF::ramSize();
-        ramSize /= 1024;
+
+        // We originally went with a hardcoded /= 1024 here
+        // But if you don't have much memory, that becomes a problem.
+        // Instead, we do 65%
+        double ramSizeDouble = static_cast<double>(ramSize);
+        ramSizeDouble *= 0.65;
+        ramSize = static_cast<size_t>(ramSizeDouble);
 
         if (ramSize > 0) {
             JSC::Options::forceRAMSize() = ramSize;
@@ -306,6 +334,140 @@ extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(c
 }
 
 extern "C" void* Bun__getVM();
+extern "C" JSGlobalObject* Bun__getDefaultGlobal();
+
+// Error.captureStackTrace may cause computeErrorInfo to be called twice
+// Rather than figure out the plumbing in JSC, we just skip the next call
+// TODO: thread_local for workers
+static bool skipNextComputeErrorInfo = false;
+
+// error.stack calls this function
+// Returns "Name: message" followed by one "    at fn (url:line:column)" row per frame, with row positions passed
+// through Bun__remapStackFramePositions. line/column/sourceURL receive the first positioned frame's pre-remap location.
+static String computeErrorInfoWithoutPrepareStackTrace(JSC::VM& vm, Vector<StackFrame>& stackTrace, unsigned& line, unsigned& column, String& sourceURL, JSObject* errorInstance)
+{
+    if (!errorInstance) {
+        return String();
+    }
+
+    if (skipNextComputeErrorInfo) {
+        return String();
+    }
+
+    Zig::GlobalObject* globalObject = jsDynamicCast<Zig::GlobalObject*>(errorInstance->globalObject());
+    if (!globalObject) {
+        // Happens in node:vm
+        globalObject = jsDynamicCast<Zig::GlobalObject*>(Bun__getDefaultGlobal());
+    }
+
+    WTF::String name = "Error"_s;
+    WTF::String message;
+
+    // Note that we are not allowed to allocate memory in here. It's called inside a finalizer.
+    if (auto* instance = jsDynamicCast<ErrorInstance*>(errorInstance)) {
+        name = instance->sanitizedNameString(globalObject);
+        message = instance->sanitizedMessageString(globalObject);
+    }
+
+    WTF::StringBuilder sb;
+
+    if (!name.isEmpty()) {
+        sb.append(name);
+        sb.append(": "_s);
+    }
+
+    if (!message.isEmpty()) {
+        sb.append(message);
+    }
+
+    if (stackTrace.isEmpty()) {
+        return sb.toString();
+    }
+
+    if ((!message.isEmpty() || !name.isEmpty())) {
+        sb.append("\n"_s);
+    }
+
+    size_t framesCount = stackTrace.size();
+    bool hasSet = false;
+    for (size_t i = 0; i < framesCount; i++) {
+        StackFrame& frame = stackTrace.at(i);
+
+        sb.append("    at "_s);
+
+        WTF::String functionName = frame.functionName(vm);
+
+        if (auto codeblock = frame.codeBlock()) {
+            if (codeblock->isConstructor()) {
+                sb.append("new "_s);
+            }
+
+            // TODO: async
+        }
+
+        if (functionName.isEmpty()) {
+            sb.append("<anonymous>"_s);
+        } else {
+            sb.append(functionName);
+        }
+
+        sb.append(" ("_s);
+
+        if (frame.hasLineAndColumnInfo()) {
+            unsigned int thisLine = 0;
+            unsigned int thisColumn = 0;
+            frame.computeLineAndColumn(thisLine, thisColumn);
+            // Zero-initialize so `remapped` and every field not assigned below start from a known state.
+            ZigStackFrame remappedFrame = {};
+            remappedFrame.position.line = thisLine;
+            remappedFrame.position.column_start = thisColumn;
+            String sourceURLForFrame = frame.sourceURL(vm);
+
+            if (!sourceURLForFrame.isEmpty()) {
+                remappedFrame.source_url = Bun::toString(sourceURLForFrame);
+            } else {
+                // https://github.com/oven-sh/bun/issues/3595
+                remappedFrame.source_url = BunStringEmpty;
+            }
+
+            // This ensures the lifetime of the sourceURL is accounted for correctly
+            Bun__remapStackFramePositions(globalObject, &remappedFrame, 1);
+
+            if (!hasSet) {
+                hasSet = true;
+                line = thisLine;
+                column = thisColumn;
+                sourceURL = sourceURLForFrame;
+
+                if (remappedFrame.remapped) {
+                    errorInstance->putDirect(vm, Identifier::fromString(vm, "originalLine"_s), jsNumber(thisLine), 0);
+                    errorInstance->putDirect(vm, Identifier::fromString(vm, "originalColumn"_s), jsNumber(thisColumn), 0);
+                }
+            }
+
+            sb.append(sourceURLForFrame);
+            sb.append(":"_s);
+            sb.append(remappedFrame.position.line);
+            sb.append(":"_s);
+            sb.append(remappedFrame.position.column_start);
+        } else {
+            sb.append("native"_s);
+        }
+        sb.append(")"_s);
+
+        if (i != framesCount - 1) {
+            sb.append("\n"_s);
+        }
+    }
+
+    return sb.toString();
+}
+
+// Forwards every argument unchanged to computeErrorInfoWithoutPrepareStackTrace.
+static String computeErrorInfo(JSC::VM& vm, Vector<StackFrame>& stackTrace, unsigned& line, unsigned& column, String& sourceURL, JSObject* errorInstance)
+{
+    return computeErrorInfoWithoutPrepareStackTrace(vm, stackTrace, line, column, sourceURL, errorInstance);
+}
 
 extern "C" JSC__JSGlobalObject* Zig__GlobalObject__create(JSClassRef* globalObjectClass, int count,
     void* console_client)
@@ -323,6 +485,9 @@ extern "C" JSC__JSGlobalObject* Zig__GlobalObject__create(JSClassRef* globalObje
     Zig::GlobalObject* globalObject = Zig::GlobalObject::create(vm, Zig::GlobalObject::createStructure(vm, JSC::JSGlobalObject::create(vm, JSC::JSGlobalObject::createStructure(vm, JSC::jsNull())), JSC::jsNull()));
     globalObject->setConsole(globalObject);
     globalObject->isThreadLocalDefaultGlobalObject = true;
+    globalObject->setStackTraceLimit(DEFAULT_ERROR_STACK_TRACE_LIMIT); // Node.js defaults to 10
+    vm.setOnComputeErrorInfo(computeErrorInfo);
+
     if (count > 0) {
         globalObject->installAPIGlobals(globalObjectClass, count, vm);
     }
@@ -361,8 +526,8 @@ JSC_DEFINE_HOST_FUNCTION(functionFulfillModuleSync,
         &specifier,
         &specifier);
 
-    if (result.isUndefined() || !result) {
-        return JSValue::encode(result);
+    if (scope.exception() || !result) {
+        RELEASE_AND_RETURN(scope, JSValue::encode(JSC::jsUndefined()));
     }
 
     globalObject->moduleLoader()->provideFetch(globalObject, key, jsCast<JSC::JSSourceCode*>(result)->sourceCode());
@@ -869,6 +1034,21 @@ JSC_DEFINE_HOST_FUNCTION(functionBunSleepThenCallback,
     return JSC::JSValue::encode(promise);
 }
 
+using MicrotaskCallback = void (*)(void*);
+
+JSC_DEFINE_HOST_FUNCTION(functionNativeMicrotaskTrampoline,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    // Both arguments are numbers whose bits hold raw pointers. Do not use JSCell* here because the GC will try to visit it.
+    const double encodedContext = callFrame->uncheckedArgument(0).asNumber();
+    const double encodedCallback = callFrame->uncheckedArgument(1).asNumber();
+
+    const auto contextBits = bitwise_cast<uintptr_t>(encodedContext);
+    const auto callbackBits = bitwise_cast<uintptr_t>(encodedCallback);
+    reinterpret_cast<MicrotaskCallback>(callbackBits)(reinterpret_cast<void*>(contextBits));
+    return JSValue::encode(jsUndefined());
+}
+
 JSC_DEFINE_HOST_FUNCTION(functionBunSleep,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
@@ -1032,53 +1212,69 @@ JSC_DEFINE_HOST_FUNCTION(functionBTOA,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm());
 
     if (callFrame->argumentCount() == 0) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        JSC::throwTypeError(globalObject, scope, "btoa requires 1 argument (a string)"_s);
+        JSC::throwTypeError(globalObject, throwScope, "btoa requires 1 argument (a string)"_s);
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    const String& stringToEncode = callFrame->argument(0).toWTFString(globalObject);
+    JSValue arg0 = callFrame->uncheckedArgument(0);
+    WTF::String encodedString = arg0.toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
 
-    if (!stringToEncode || stringToEncode.isNull()) {
-        return JSC::JSValue::encode(JSC::jsString(vm, WTF::String()));
+    if (encodedString.isEmpty()) {
+        return JSC::JSValue::encode(JSC::jsEmptyString(vm));
     }
 
-    if (!stringToEncode.isAllLatin1()) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        throwException(globalObject, scope, createDOMException(globalObject, ExceptionCode::InvalidCharacterError));
+    if (!encodedString.isAllLatin1()) {
+        throwException(globalObject, throwScope, createDOMException(globalObject, InvalidCharacterError));
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    return JSC::JSValue::encode(JSC::jsString(vm, WTF::base64EncodeToString(stringToEncode.latin1())));
+    // Reminder: btoa() is for Byte Strings
+    // Specifically: latin1 byte strings
+    // That means even though this looks like the wrong thing to do,
+    // we should be converting to latin1, not utf8.
+    if (!encodedString.is8Bit()) {
+        LChar* ptr;
+        unsigned length = encodedString.length();
+        auto dest = WTF::String::createUninitialized(length, ptr);
+        WTF::StringImpl::copyCharacters(ptr, encodedString.characters16(), length);
+        encodedString = WTFMove(dest);
+    }
+
+    unsigned length = encodedString.length();
+    RELEASE_AND_RETURN(
+        throwScope,
+        Bun__encoding__toString(
+            encodedString.characters8(),
+            length,
+            globalObject,
+            static_cast<uint8_t>(WebCore::BufferEncodingType::base64)));
 }
 
 static JSC_DEFINE_HOST_FUNCTION(functionATOB,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm());
 
     if (callFrame->argumentCount() == 0) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        JSC::throwTypeError(globalObject, scope, "atob requires 1 argument (a string)"_s);
+        JSC::throwTypeError(globalObject, throwScope, "atob requires 1 argument (a string)"_s);
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    const WTF::String& encodedString = callFrame->argument(0).toWTFString(globalObject);
-
-    if (encodedString.isNull()) {
-        return JSC::JSValue::encode(JSC::jsEmptyString(vm));
-    }
+    WTF::String encodedString = callFrame->uncheckedArgument(0).toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
 
-    auto decodedData = WTF::base64Decode(encodedString, Base64DecodeMode::DefaultValidatePaddingAndIgnoreWhitespace);
-    if (!decodedData) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        throwException(globalObject, scope, createDOMException(globalObject, ExceptionCode::InvalidCharacterError));
+    auto result = WebCore::Base64Utilities::atob(encodedString);
+    if (result.hasException()) {
+        throwException(globalObject, throwScope, createDOMException(*globalObject, result.releaseException()));
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    return JSC::JSValue::encode(JSC::jsString(vm, WTF::String(decodedData->data(), decodedData->size())));
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsString(vm, result.releaseReturnValue())));
 }
 
 static JSC_DEFINE_HOST_FUNCTION(functionHashCode,
@@ -1271,10 +1467,12 @@ JSC_DEFINE_HOST_FUNCTION(functionCallNotImplemented,
 
 // we're trying out a new way to do this lazy loading
 static JSC_DEFINE_HOST_FUNCTION(functionLazyLoad,
-    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+    (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callFrame))
 {
 JSC:
+    Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
     VM& vm = globalObject->vm();
+
     switch (callFrame->argumentCount()) {
     case 0: {
         auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
@@ -1283,13 +1481,6 @@ JSC:
         return JSC::JSValue::encode(JSC::JSValue {});
     }
     default: {
-        static NeverDestroyed<const String> sqliteString(MAKE_STATIC_STRING_IMPL("sqlite"));
-        static NeverDestroyed<const String> bunJSCString(MAKE_STATIC_STRING_IMPL("bun:jsc"));
-        static NeverDestroyed<const String> bunStreamString(MAKE_STATIC_STRING_IMPL("bun:stream"));
-        static NeverDestroyed<const String> noopString(MAKE_STATIC_STRING_IMPL("noop"));
-        static NeverDestroyed<const String> createImportMeta(MAKE_STATIC_STRING_IMPL("createImportMeta"));
-        static NeverDestroyed<const String> masqueradesAsUndefined(MAKE_STATIC_STRING_IMPL("masqueradesAsUndefined"));
-        static NeverDestroyed<const String> vmString(MAKE_STATIC_STRING_IMPL("vm"));
 
         JSC::JSValue moduleName = callFrame->argument(0);
         if (moduleName.isNumber()) {
@@ -1328,24 +1519,24 @@ JSC:
             return JSC::JSValue::encode(JSC::JSValue {});
         }
 
-        if (string == sqliteString) {
+        if (string == "sqlite"_s) {
             return JSC::JSValue::encode(JSSQLStatementConstructor::create(vm, globalObject, JSSQLStatementConstructor::createStructure(vm, globalObject, globalObject->m_functionPrototype.get())));
         }
 
-        if (string == bunJSCString) {
+        if (string == "bun:jsc"_s) {
             return JSC::JSValue::encode(createJSCModule(globalObject));
         }
 
-        if (string == pathToFileURLString) {
+        if (string == "pathToFileURL"_s) {
             return JSValue::encode(
                 JSFunction::create(vm, globalObject, 1, pathToFileURLString, functionPathToFileURL, ImplementationVisibility::Public, NoIntrinsic));
         }
-        if (string == fileURLToPathString) {
+        if (string == "fileURLToPath"_s) {
             return JSValue::encode(
                 JSFunction::create(vm, globalObject, 1, fileURLToPathString, functionFileURLToPath, ImplementationVisibility::Public, NoIntrinsic));
         }
 
-        if (string == bunStreamString) {
+        if (string == "bun:stream"_s) {
             auto* obj = constructEmptyObject(globalObject);
             obj->putDirect(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "BufferList"_s)), reinterpret_cast<Zig::GlobalObject*>(globalObject)->JSBufferList(), 0);
             obj->putDirect(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "ReadableState"_s)), reinterpret_cast<Zig::GlobalObject*>(globalObject)->JSReadableState(), 0);
@@ -1364,16 +1555,16 @@ JSC:
             return JSValue::encode(obj);
         }
 
-        if (string == createImportMeta) {
+        if (string == "createImportMeta"_s) {
             Zig::ImportMetaObject* obj = Zig::ImportMetaObject::create(globalObject, callFrame->argument(1));
             return JSValue::encode(obj);
         }
 
-        if (string == masqueradesAsUndefined) {
+        if (string == "masqueradesAsUndefined"_s) {
             return JSValue::encode(InternalFunction::createFunctionThatMasqueradesAsUndefined(vm, globalObject, 0, String(), functionCallNotImplemented));
         }
 
-        if (string == vmString) {
+        if (string == "vm"_s) {
             auto* obj = constructEmptyObject(globalObject);
             obj->putDirect(
                 vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "Script"_s)),
@@ -1394,7 +1585,22 @@ JSC:
             return JSValue::encode(obj);
         }
 
-        if (UNLIKELY(string == noopString)) {
+        if (string == "vm"_s) {
+            auto* obj = constructEmptyObject(globalObject);
+        }
+
+        if (string == "primordials"_s) {
+            auto sourceOrigin = callFrame->callerSourceOrigin(vm).url();
+            bool isBuiltin = sourceOrigin.protocolIs("builtin"_s);
+            if (!isBuiltin) {
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            auto* obj = globalObject->primordialsObject();
+            return JSValue::encode(obj);
+        }
+
+        if (UNLIKELY(string == "noop"_s)) {
             auto* obj = constructEmptyObject(globalObject);
             obj->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "getterSetter"_s)), JSC::CustomGetterSetter::create(vm, noop_getter, noop_setter), 0);
             Zig::JSFFIFunction* function = Zig::JSFFIFunction::create(vm, reinterpret_cast<Zig::GlobalObject*>(globalObject), 0, String(), functionNoop, JSC::NoIntrinsic);
@@ -2569,7 +2775,32 @@ JSC::JSValue GlobalObject::formatStackTrace(JSC::VM& vm, JSC::JSGlobalObject* le
 
 extern "C" EncodedJSValue JSPasswordObject__create(JSC::JSGlobalObject*, bool);
 
-JSC_DECLARE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace);
+JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncAppendStackTrace, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callFrame))
+{
+    // Moves the stack frames held by the first argument onto the end of the second argument's stack trace.
+    GlobalObject* globalObject = reinterpret_cast<GlobalObject*>(lexicalGlobalObject);
+    JSC::VM& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSC::ErrorInstance* source = jsDynamicCast<JSC::ErrorInstance*>(callFrame->argument(0));
+    JSC::ErrorInstance* destination = jsDynamicCast<JSC::ErrorInstance*>(callFrame->argument(1));
+
+    if (!source || !destination) {
+        throwTypeError(lexicalGlobalObject, scope, "First & second argument must be an Error object"_s);
+        return JSC::JSValue::encode(jsUndefined());
+    }
+
+    if (!destination->stackTrace()) {
+        destination->captureStackTrace(vm, globalObject, 1);
+    }
+
+    // Appending an error to itself would only wipe its trace; also tolerate a destination that still has no trace.
+    if (source != destination && source->stackTrace() && destination->stackTrace()) {
+        destination->stackTrace()->appendVector(*source->stackTrace());
+        source->stackTrace()->clear();
+    }
+    return JSC::JSValue::encode(jsUndefined());
+}
+
 JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callFrame))
 {
     GlobalObject* globalObject = reinterpret_cast<GlobalObject*>(lexicalGlobalObject);
@@ -2584,18 +2815,15 @@ JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalOb
     JSC::JSObject* errorObject = objectArg.asCell()->getObject();
     JSC::JSValue caller = callFrame->argument(1);
 
+    // We cannot use our ErrorInstance::captureStackTrace() fast path here unfortunately.
+    // We need to return these CallSite array objects which means we need to create them
     JSValue errorValue = lexicalGlobalObject->get(lexicalGlobalObject, vm.propertyNames->Error);
     auto* errorConstructor = jsDynamicCast<JSC::JSObject*>(errorValue);
-
-    size_t stackTraceLimit = DEFAULT_ERROR_STACK_TRACE_LIMIT;
-    if (JSC::JSValue stackTraceLimitProp = errorConstructor->getIfPropertyExists(lexicalGlobalObject, vm.propertyNames->stackTraceLimit)) {
-        if (stackTraceLimitProp.isNumber()) {
-            stackTraceLimit = std::min(std::max(static_cast<size_t>(stackTraceLimitProp.toIntegerOrInfinity(lexicalGlobalObject)), 0ul), 2048ul);
-            if (stackTraceLimit == 0) {
-                stackTraceLimit = 2048;
-            }
-        }
+    size_t stackTraceLimit = globalObject->stackTraceLimit().value();
+    if (stackTraceLimit == 0) {
+        stackTraceLimit = DEFAULT_ERROR_STACK_TRACE_LIMIT;
     }
+
     JSCStackTrace stackTrace = JSCStackTrace::captureCurrentJSStackTrace(globalObject, callFrame, stackTraceLimit, caller);
 
     // Create an (uninitialized) array for our "call sites"
@@ -2619,7 +2847,7 @@ JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalOb
     size_t framesCount = stackTrace.size();
     ZigStackFrame remappedFrames[framesCount];
     for (int i = 0; i < framesCount; i++) {
-        remappedFrames[i].source_url = Zig::toZigString(stackTrace.at(i).sourceURL(), lexicalGlobalObject);
+        remappedFrames[i].source_url = Bun::toString(lexicalGlobalObject, stackTrace.at(i).sourceURL());
         if (JSCStackFrame::SourcePositions* sourcePositions = stackTrace.at(i).getSourcePositions()) {
             remappedFrames[i].position.line = sourcePositions->line.zeroBasedInt();
             remappedFrames[i].position.column_start = sourcePositions->startColumn.zeroBasedInt() + 1;
@@ -2652,13 +2880,27 @@ JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalOb
     JSC::JSValue formattedStackTrace = globalObject->formatStackTrace(vm, lexicalGlobalObject, errorObject, callSites);
     RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({}));
 
+    bool orignialSkipNextComputeErrorInfo = skipNextComputeErrorInfo;
+    skipNextComputeErrorInfo = true;
     if (errorObject->hasProperty(lexicalGlobalObject, vm.propertyNames->stack)) {
+        skipNextComputeErrorInfo = true;
         errorObject->deleteProperty(lexicalGlobalObject, vm.propertyNames->stack);
     }
+    skipNextComputeErrorInfo = orignialSkipNextComputeErrorInfo;
+
     if (formattedStackTrace.isUndefinedOrNull()) {
-        errorObject->putDirect(vm, vm.propertyNames->stack, jsUndefined(), JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::DontEnum);
-    } else {
-        errorObject->putDirect(vm, vm.propertyNames->stack, formattedStackTrace, JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::DontEnum);
+        formattedStackTrace = JSC::jsUndefined();
+    }
+
+    errorObject->putDirect(vm, vm.propertyNames->stack, formattedStackTrace, 0);
+
+    if (auto* instance = jsDynamicCast<JSC::ErrorInstance*>(errorObject)) {
+        // we make a separate copy of the StackTrace unfortunately so that we
+        // can later console.log it without losing the info
+        //
+        // This is not good. We should remove this in the future, as it makes this function
+        // strictly slower than necessary.
+        instance->captureStackTrace(vm, globalObject, 1, false);
     }
 
     RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSValue {}));
@@ -2721,7 +2963,7 @@ void GlobalObject::finishCreation(VM& vm)
             JSC::Structure* structure = globalObject->structureCache().emptyObjectStructureForPrototype(
                 globalObject,
                 globalObject->objectPrototype(),
-                5);
+                3);
             JSC::PropertyOffset offset;
             auto& vm = globalObject->vm();
 
@@ -2735,13 +2977,6 @@ void GlobalObject::finishCreation(VM& vm)
             structure = structure->addPropertyTransition(
                 vm,
                 structure,
-                JSC::Identifier::fromString(vm, "exports"_s),
-                0,
-                offset);
-
-            structure = structure->addPropertyTransition(
-                vm,
-                structure,
                 JSC::Identifier::fromString(vm, "__dirname"_s),
                 0,
                 offset);
@@ -2753,13 +2988,6 @@ void GlobalObject::finishCreation(VM& vm)
                 0,
                 offset);
 
-            structure = structure->addPropertyTransition(
-                vm,
-                structure,
-                JSC::Identifier::fromString(vm, "require"_s),
-                JSC::PropertyAttribute::Function | JSC::PropertyAttribute::Builtin | 0,
-                offset);
-
             init.set(structure);
         });
 
@@ -2832,6 +3060,11 @@ void GlobalObject::finishCreation(VM& vm)
             init.set(JSFunction::create(init.vm, init.owner, 4, "performMicrotaskVariadic"_s, jsFunctionPerformMicrotaskVariadic, ImplementationVisibility::Public));
         });
 
+    m_nativeMicrotaskTrampoline.initLater(
+        [](const Initializer<JSFunction>& init) {
+            init.set(JSFunction::create(init.vm, init.owner, 2, ""_s, functionNativeMicrotaskTrampoline, ImplementationVisibility::Public));
+        });
+
     m_navigatorObject.initLater(
         [](const Initializer<JSObject>& init) {
             int cpuCount = 0;
@@ -2945,11 +3178,7 @@ void GlobalObject::finishCreation(VM& vm)
             Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(init.owner);
             auto* process = Zig::Process::create(
                 *globalObject, Zig::Process::createStructure(init.vm, init.owner, WebCore::JSEventEmitter::prototype(init.vm, *globalObject)));
-            process->putDirectCustomAccessor(init.vm, JSC::Identifier::fromString(init.vm, "env"_s),
-                JSC::CustomGetterSetter::create(init.vm, lazyProcessEnvGetter, lazyProcessEnvSetter),
-                JSC::PropertyAttribute::DontDelete
-                    | JSC::PropertyAttribute::CustomValue
-                    | 0);
+
             init.set(process);
         });
 
@@ -2979,14 +3208,20 @@ void GlobalObject::finishCreation(VM& vm)
             init.set(structure);
         });
 
-    m_requireResolveFunctionStructure.initLater(
-        [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::Structure>::Initializer& init) {
-            init.set(Zig::ImportMetaObject::createResolveFunctionStructure(init.vm, jsCast<Zig::GlobalObject*>(init.owner)));
+    m_importMetaRequireFunctionUnbound.initLater(
+        [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::JSObject>::Initializer& init) {
+            init.set(
+                Zig::ImportMetaObject::createRequireFunctionUnbound(init.vm, init.owner));
         });
-
-    m_resolveFunctionPrototype.initLater(
+    m_importMetaRequireResolveFunctionUnbound.initLater(
         [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::JSObject>::Initializer& init) {
-            init.set(Zig::ImportMetaObject::createResolveFunctionPrototype(init.vm, jsCast<Zig::GlobalObject*>(init.owner)).getObject());
+            init.set(
+                Zig::ImportMetaObject::createRequireResolveFunctionUnbound(init.vm, init.owner));
+        });
+
+    m_importMetaObjectStructure.initLater(
+        [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::Structure>::Initializer& init) {
+            init.set(Zig::ImportMetaObject::createStructure(init.vm, init.owner));
         });
 
     m_JSFileSinkClassStructure.initLater(
@@ -3110,11 +3345,8 @@ void GlobalObject::finishCreation(VM& vm)
     RELEASE_ASSERT(classInfo());
 
     JSC::JSObject* errorConstructor = this->errorConstructor();
-    errorConstructor->putDirectNativeFunctionWithoutTransition(vm, this, JSC::Identifier::fromString(vm, "captureStackTrace"_s), 2, errorConstructorFuncCaptureStackTrace, ImplementationVisibility::Public, JSC::NoIntrinsic, PropertyAttribute::DontEnum | 0);
-
-    // JSC default is 100
-    errorConstructor->putDirect(vm, vm.propertyNames->stackTraceLimit, jsNumber(DEFAULT_ERROR_STACK_TRACE_LIMIT), JSC::PropertyAttribute::DontEnum | 0);
-
+    errorConstructor->putDirectNativeFunction(vm, this, JSC::Identifier::fromString(vm, "captureStackTrace"_s), 2, errorConstructorFuncCaptureStackTrace, ImplementationVisibility::Public, JSC::NoIntrinsic, PropertyAttribute::DontEnum | 0);
+    errorConstructor->putDirectNativeFunction(vm, this, JSC::Identifier::fromString(vm, "appendStackTrace"_s), 2, errorConstructorFuncAppendStackTrace, ImplementationVisibility::Private, JSC::NoIntrinsic, PropertyAttribute::DontEnum | 0);
     JSC::JSValue console = this->get(this, JSC::Identifier::fromString(vm, "console"_s));
     JSC::JSObject* consoleObject = console.getObject();
     consoleObject->putDirectBuiltinFunction(vm, this, vm.propertyNames->asyncIteratorSymbol, consoleObjectAsyncIteratorCodeGenerator(vm), PropertyAttribute::Builtin | PropertyAttribute::DontDelete);
@@ -3434,7 +3666,7 @@ void GlobalObject::addBuiltinGlobals(JSC::VM& vm)
     auto& builtinNames = WebCore::builtinNames(vm);
 
     WTF::Vector<GlobalPropertyInfo> extraStaticGlobals;
-    extraStaticGlobals.reserveCapacity(43);
+    extraStaticGlobals.reserveCapacity(44);
 
     JSC::Identifier queueMicrotaskIdentifier = JSC::Identifier::fromString(vm, "queueMicrotask"_s);
     extraStaticGlobals.uncheckedAppend(
@@ -3563,7 +3795,7 @@ void GlobalObject::addBuiltinGlobals(JSC::VM& vm)
     putDirectBuiltinFunction(vm, this, builtinNames.loadCJS2ESMPrivateName(), importMetaObjectLoadCJS2ESMCodeGenerator(vm), PropertyAttribute::Builtin | PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly);
     putDirectBuiltinFunction(vm, this, builtinNames.internalRequirePrivateName(), importMetaObjectInternalRequireCodeGenerator(vm), PropertyAttribute::Builtin | PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly);
     putDirectNativeFunction(vm, this, builtinNames.createUninitializedArrayBufferPrivateName(), 1, functionCreateUninitializedArrayBuffer, ImplementationVisibility::Public, NoIntrinsic, PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly | PropertyAttribute::Function);
-    putDirectNativeFunction(vm, this, builtinNames.resolveSyncPrivateName(), 1, functionImportMeta__resolveSync, ImplementationVisibility::Public, NoIntrinsic, PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly | PropertyAttribute::Function);
+    putDirectNativeFunction(vm, this, builtinNames.resolveSyncPrivateName(), 1, functionImportMeta__resolveSyncPrivate, ImplementationVisibility::Public, NoIntrinsic, PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly | PropertyAttribute::Function);
 
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "process"_s), JSC::CustomGetterSetter::create(vm, property_lazyProcessGetter, property_lazyProcessSetter),
         JSC::PropertyAttribute::CustomAccessor | 0);
@@ -3655,30 +3887,44 @@ void GlobalObject::addBuiltinGlobals(JSC::VM& vm)
     PUT_WEBCORE_GENERATED_CONSTRUCTOR("Headers"_s, JSFetchHeaders);
     PUT_WEBCORE_GENERATED_CONSTRUCTOR("URLSearchParams"_s, JSURLSearchParams);
 
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableByteStreamControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBRequestPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultWriterPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().AbortSignalPrivateName(), CustomGetterSetter::create(vm, JSDOMAbortSignal_getter, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableByteStreamControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBRequestPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultWriterPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.ReadableByteStreamControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBRequestPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultWriterPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.AbortSignalPrivateName(), CustomGetterSetter::create(vm, JSDOMAbortSignal_getter, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableByteStreamControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBRequestPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultWriterPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+
+    putDirectNativeFunction(vm, this,
+        builtinNames.createCommonJSModulePrivateName(),
+        2,
+        Bun::jsFunctionCreateCommonJSModule,
+        ImplementationVisibility::Public,
+        NoIntrinsic,
+        JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DontDelete | 0);
+    putDirectNativeFunction(vm, this,
+        builtinNames.evaluateCommonJSModulePrivateName(),
+        2,
+        Bun::jsFunctionLoadModule,
+        ImplementationVisibility::Public,
+        NoIntrinsic,
+        JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DontDelete | 0);
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "ByteLengthQueuingStrategy"_s), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ByteLengthQueuingStrategyConstructor, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "CountQueuingStrategy"_s), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_CountQueuingStrategyConstructor, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "SubtleCrypto"_s), JSC::CustomGetterSetter::create(vm, getterSubtleCryptoConstructor, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
@@ -4013,6 +4259,7 @@ void GlobalObject::visitChildrenImpl(JSCell* cell, Visitor& visitor)
     thisObject->m_JSFileSinkControllerPrototype.visit(visitor);
     thisObject->m_JSHTTPSResponseControllerPrototype.visit(visitor);
     thisObject->m_navigatorObject.visit(visitor);
+    thisObject->m_nativeMicrotaskTrampoline.visit(visitor);
     thisObject->m_performanceObject.visit(visitor);
     thisObject->m_primordialsObject.visit(visitor);
     thisObject->m_processEnvObject.visit(visitor);
@@ -4023,8 +4270,10 @@ void GlobalObject::visitChildrenImpl(JSCell* cell, Visitor& visitor)
     thisObject->m_emitReadableNextTickFunction.visit(visitor);
     thisObject->m_JSBufferSubclassStructure.visit(visitor);
 
-    thisObject->m_requireResolveFunctionStructure.visit(visitor);
-    thisObject->m_resolveFunctionPrototype.visit(visitor);
+    thisObject->m_importMetaRequireFunctionUnbound.visit(visitor);
+    thisObject->m_importMetaRequireResolveFunctionUnbound.visit(visitor);
+    thisObject->m_importMetaObjectStructure.visit(visitor);
+
     thisObject->m_dnsObject.visit(visitor);
     thisObject->m_lazyRequireCacheObject.visit(visitor);
     thisObject->m_vmModuleContextMap.visit(visitor);
@@ -4173,6 +4422,14 @@ extern "C" void JSC__JSGlobalObject__reload(JSC__JSGlobalObject* arg0)
     globalObject->reload();
 }
 
+extern "C" void JSC__JSGlobalObject__queueMicrotaskCallback(Zig::GlobalObject* globalObject, void* ptr, MicrotaskCallback callback)
+{
+    JSFunction* function = globalObject->nativeMicrotaskTrampoline();
+
+    // Pass ptr and callback as JSValues built from their bits reinterpreted as doubles; do not use JSCell* here because the GC will try to visit it.
+    globalObject->queueMicrotask(function, JSValue(bitwise_cast<double>(reinterpret_cast<uintptr_t>(ptr))), JSValue(bitwise_cast<double>(reinterpret_cast<uintptr_t>(callback))), jsUndefined(), jsUndefined());
+}
+
 JSC::Identifier GlobalObject::moduleLoaderResolve(JSGlobalObject* globalObject,
     JSModuleLoader* loader, JSValue key,
     JSValue referrer, JSValue origin)
diff --git a/src/bun.js/bindings/ZigGlobalObject.h b/src/bun.js/bindings/ZigGlobalObject.h
index 2d69e764f..f44212da1 100644
--- a/src/bun.js/bindings/ZigGlobalObject.h
+++ b/src/bun.js/bindings/ZigGlobalObject.h
@@ -248,8 +248,8 @@ public:
 
     JSC::JSFunction* emitReadableNextTickFunction() { return m_emitReadableNextTickFunction.getInitializedOnMainThread(this); }
 
-    Structure* requireResolveFunctionStructure() { return m_requireResolveFunctionStructure.getInitializedOnMainThread(this); }
-    JSObject* requireResolveFunctionPrototype() { return m_resolveFunctionPrototype.getInitializedOnMainThread(this); }
+    JSObject* importMetaRequireFunctionUnbound() { return m_importMetaRequireFunctionUnbound.getInitializedOnMainThread(this); }
+    JSObject* importMetaRequireResolveFunctionUnbound() { return m_importMetaRequireResolveFunctionUnbound.getInitializedOnMainThread(this); }
 
     JSObject* lazyRequireCacheObject() { return m_lazyRequireCacheObject.getInitializedOnMainThread(this); }
 
@@ -262,6 +262,7 @@ public:
     JSObject* lazyTestModuleObject() { return m_lazyTestModuleObject.getInitializedOnMainThread(this); }
     JSObject* lazyPreloadTestModuleObject() { return m_lazyPreloadTestModuleObject.getInitializedOnMainThread(this); }
     Structure* CommonJSModuleObjectStructure() { return m_commonJSModuleObjectStructure.getInitializedOnMainThread(this); }
+    Structure* ImportMetaObjectStructure() { return m_importMetaObjectStructure.getInitializedOnMainThread(this); }
 
     Structure* commonJSFunctionArgumentsStructure() { return m_commonJSFunctionArgumentsStructure.getInitializedOnMainThread(this); }
 
@@ -269,6 +270,8 @@ public:
 
     JSWeakMap* vmModuleContextMap() { return m_vmModuleContextMap.getInitializedOnMainThread(this); }
 
+    bool hasProcessObject() const { return m_processObject.isInitialized(); }
+
     JSC::JSObject* processObject()
     {
         return m_processObject.getInitializedOnMainThread(this);
@@ -368,6 +371,7 @@ public:
     mutable WriteBarrier<JSFunction> m_thenables[promiseFunctionsSize + 1];
 
     JSObject* navigatorObject();
+    JSFunction* nativeMicrotaskTrampoline() { return m_nativeMicrotaskTrampoline.getInitializedOnMainThread(this); }
 
     void trackFFIFunction(JSC::JSFunction* function)
     {
@@ -465,6 +469,7 @@ private:
      */
     LazyProperty<JSGlobalObject, JSC::Structure> m_pendingVirtualModuleResultStructure;
     LazyProperty<JSGlobalObject, JSFunction> m_performMicrotaskFunction;
+    LazyProperty<JSGlobalObject, JSFunction> m_nativeMicrotaskTrampoline;
     LazyProperty<JSGlobalObject, JSFunction> m_performMicrotaskVariadicFunction;
     LazyProperty<JSGlobalObject, JSFunction> m_emitReadableNextTickFunction;
     LazyProperty<JSGlobalObject, JSMap> m_lazyReadableStreamPrototypeMap;
@@ -481,8 +486,6 @@ private:
     LazyProperty<JSGlobalObject, JSObject> m_subtleCryptoObject;
     LazyProperty<JSGlobalObject, Structure> m_JSHTTPResponseController;
     LazyProperty<JSGlobalObject, JSC::Structure> m_JSBufferSubclassStructure;
-    LazyProperty<JSGlobalObject, JSC::Structure> m_requireResolveFunctionStructure;
-    LazyProperty<JSGlobalObject, JSObject> m_resolveFunctionPrototype;
     LazyProperty<JSGlobalObject, JSObject> m_dnsObject;
     LazyProperty<JSGlobalObject, JSWeakMap> m_vmModuleContextMap;
     LazyProperty<JSGlobalObject, JSObject> m_lazyRequireCacheObject;
@@ -496,6 +499,10 @@ private:
     LazyProperty<JSGlobalObject, Structure> m_commonJSModuleObjectStructure;
     LazyProperty<JSGlobalObject, Structure> m_commonJSFunctionArgumentsStructure;
 
+    LazyProperty<JSGlobalObject, JSC::JSObject> m_importMetaRequireFunctionUnbound;
+    LazyProperty<JSGlobalObject, JSC::JSObject> m_importMetaRequireResolveFunctionUnbound;
+    LazyProperty<JSGlobalObject, JSC::Structure> m_importMetaObjectStructure;
+
     DOMGuardedObjectSet m_guardedObjects WTF_GUARDED_BY_LOCK(m_gcLock);
     void* m_bunVM;
 
diff --git a/src/bun.js/bindings/ZigSourceProvider.cpp b/src/bun.js/bindings/ZigSourceProvider.cpp
index ab3062cd5..a71e946de 100644
--- a/src/bun.js/bindings/ZigSourceProvider.cpp
+++ b/src/bun.js/bindings/ZigSourceProvider.cpp
@@ -43,39 +43,34 @@ static uintptr_t getSourceProviderMapKey(ResolvedSource& resolvedSource)
     }
 }
 
-Ref<SourceProvider> SourceProvider::create(Zig::GlobalObject* globalObject, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType)
+static SourceOrigin toSourceOrigin(const String& sourceURL, bool isBuiltin)
 {
-
-    uintptr_t providerKey = 0;
-    if (globalObject->isThreadLocalDefaultGlobalObject) {
-        auto& sourceProviderMap = globalObject->sourceProviderMap;
-        providerKey = getSourceProviderMapKey(resolvedSource);
-        if (providerKey) {
-            auto sourceProvider = sourceProviderMap.get(providerKey);
-            if (sourceProvider != nullptr) {
-                sourceProvider->ref();
-                return adoptRef(*reinterpret_cast<Zig::SourceProvider*>(sourceProvider));
-            }
+    if (isBuiltin) {
+        if (sourceURL.startsWith("node:"_s)) {
+            return SourceOrigin(WTF::URL(makeString("builtin://node/", sourceURL.substring(5))));
+        } else if (sourceURL.startsWith("bun:"_s)) {
+            return SourceOrigin(WTF::URL(makeString("builtin://bun/", sourceURL.substring(4))));
+        } else {
+            return SourceOrigin(WTF::URL(makeString("builtin://", sourceURL)));
         }
     }
+
+    return SourceOrigin(WTF::URL::fileURLWithFileSystemPath(sourceURL));
+}
+
+Ref<SourceProvider> SourceProvider::create(Zig::GlobalObject* globalObject, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType, bool isBuiltin)
+{
+
     auto stringImpl = Bun::toWTFString(resolvedSource.source_code);
     auto sourceURLString = toStringCopy(resolvedSource.source_url);
 
-    if (stringImpl.impl()->refCount() > 1)
-        // Deref because we don't call a destructor for BunString
-        stringImpl.impl()->deref();
-
     auto provider = adoptRef(*new SourceProvider(
         globalObject->isThreadLocalDefaultGlobalObject ? globalObject : nullptr,
         resolvedSource, stringImpl.releaseImpl().releaseNonNull(),
-        JSC::SourceOrigin(WTF::URL::fileURLWithFileSystemPath(sourceURLString)),
+        toSourceOrigin(sourceURLString, isBuiltin),
         sourceURLString.impl(), TextPosition(),
         sourceType));
 
-    if (providerKey) {
-        globalObject->sourceProviderMap.set(providerKey, provider.copyRef());
-    }
-
     return provider;
 }
 
@@ -90,11 +85,6 @@ unsigned SourceProvider::hash() const
 
 void SourceProvider::freeSourceCode()
 {
-    if (m_globalObjectForSourceProviderMap) {
-        m_globalObjectForSourceProviderMap->sourceProviderMap.remove((uintptr_t)m_source.get().characters8());
-    }
-
-    m_source = *WTF::StringImpl::empty();
 }
 
 void SourceProvider::updateCache(const UnlinkedFunctionExecutable* executable, const SourceCode&,
diff --git a/src/bun.js/bindings/ZigSourceProvider.h b/src/bun.js/bindings/ZigSourceProvider.h
index dd78b20ae..c189cc454 100644
--- a/src/bun.js/bindings/ZigSourceProvider.h
+++ b/src/bun.js/bindings/ZigSourceProvider.h
@@ -34,7 +34,7 @@ class SourceProvider final : public JSC::SourceProvider {
     using SourceOrigin = JSC::SourceOrigin;
 
 public:
-    static Ref<SourceProvider> create(Zig::GlobalObject*, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType = JSC::SourceProviderSourceType::Module);
+    static Ref<SourceProvider> create(Zig::GlobalObject*, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType = JSC::SourceProviderSourceType::Module, bool isBuiltIn = false);
     ~SourceProvider()
     {
         freeSourceCode();
diff --git a/src/bun.js/bindings/bindings.cpp b/src/bun.js/bindings/bindings.cpp
index 4eee81f4d..d311072e4 100644
--- a/src/bun.js/bindings/bindings.cpp
+++ b/src/bun.js/bindings/bindings.cpp
@@ -679,8 +679,8 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
             return false;
         }
 
-        JSC::PropertyNameArray a1(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Include);
-        JSC::PropertyNameArray a2(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Include);
+        JSC::PropertyNameArray a1(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Exclude);
+        JSC::PropertyNameArray a2(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Exclude);
         JSObject::getOwnPropertyNames(o1, globalObject, a1, DontEnumPropertiesMode::Exclude);
         JSObject::getOwnPropertyNames(o2, globalObject, a2, DontEnumPropertiesMode::Exclude);
 
@@ -753,7 +753,7 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
             }
 
             o1Structure->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
-                if (entry.attributes() & PropertyAttribute::DontEnum) {
+                if (entry.attributes() & PropertyAttribute::DontEnum || PropertyName(entry.key()).isPrivateName()) {
                     return true;
                 }
                 count1++;
@@ -787,7 +787,7 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
             if (result && o2Structure->id() != o1Structure->id()) {
                 size_t remain = count1;
                 o2Structure->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
-                    if (entry.attributes() & PropertyAttribute::DontEnum) {
+                    if (entry.attributes() & PropertyAttribute::DontEnum || PropertyName(entry.key()).isPrivateName()) {
                         return true;
                     }
 
@@ -815,8 +815,8 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
         }
     }
 
-    JSC::PropertyNameArray a1(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Include);
-    JSC::PropertyNameArray a2(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Include);
+    JSC::PropertyNameArray a1(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Exclude);
+    JSC::PropertyNameArray a2(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Exclude);
     o1->getPropertyNames(globalObject, a1, DontEnumPropertiesMode::Exclude);
     o2->getPropertyNames(globalObject, a2, DontEnumPropertiesMode::Exclude);
 
@@ -1279,15 +1279,14 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
     JSC__JSGlobalObject* globalObject)
 {
 
-    static const char* system_error_name = "SystemError";
     SystemError err = *arg0;
 
     JSC::VM& vm = globalObject->vm();
 
     auto scope = DECLARE_THROW_SCOPE(vm);
     JSC::JSValue message = JSC::jsUndefined();
-    if (err.message.len > 0) {
-        message = Zig::toJSString(err.message, globalObject);
+    if (err.message.tag != BunStringTag::Empty) {
+        message = Bun::toJS(globalObject, err.message);
     }
 
     JSC::JSValue options = JSC::jsUndefined();
@@ -1297,8 +1296,8 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
 
     auto clientData = WebCore::clientData(vm);
 
-    if (err.code.len > 0 && !(err.code.len == 1 and err.code.ptr[0] == 0)) {
-        JSC::JSValue code = Zig::toJSStringGC(err.code, globalObject);
+    if (err.code.tag != BunStringTag::Empty) {
+        JSC::JSValue code = Bun::toJS(globalObject, err.code);
         result->putDirect(vm, clientData->builtinNames().codePublicName(), code,
             JSC::PropertyAttribute::DontDelete | 0);
 
@@ -1307,13 +1306,12 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
 
         result->putDirect(
             vm, vm.propertyNames->name,
-            JSC::JSValue(JSC::jsOwnedString(
-                vm, WTF::String(WTF::StringImpl::createWithoutCopying(system_error_name, 11)))),
+            JSC::JSValue(jsString(vm, String("SystemError"_s))),
             JSC::PropertyAttribute::DontEnum | 0);
     }
 
-    if (err.path.len > 0) {
-        JSC::JSValue path = JSC::JSValue(Zig::toJSStringGC(err.path, globalObject));
+    if (err.path.tag != BunStringTag::Empty) {
+        JSC::JSValue path = Bun::toJS(globalObject, err.path);
         result->putDirect(vm, clientData->builtinNames().pathPublicName(), path,
             JSC::PropertyAttribute::DontDelete | 0);
     }
@@ -1324,8 +1322,8 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
             JSC::PropertyAttribute::DontDelete | 0);
     }
 
-    if (err.syscall.len > 0) {
-        JSC::JSValue syscall = JSC::JSValue(Zig::toJSString(err.syscall, globalObject));
+    if (err.syscall.tag != BunStringTag::Empty) {
+        JSC::JSValue syscall = Bun::toJS(globalObject, err.syscall);
         result->putDirect(vm, clientData->builtinNames().syscallPublicName(), syscall,
             JSC::PropertyAttribute::DontDelete | 0);
     }
@@ -2593,6 +2591,12 @@ bool JSC__JSPromise__isHandled(const JSC__JSPromise* arg0, JSC__VM* arg1)
 {
     return arg0->isHandled(reinterpret_cast<JSC::VM&>(arg1));
 }
+void JSC__JSPromise__setHandled(JSC__JSPromise* promise, JSC__VM* arg1)
+{ // Marks `promise` as handled by setting isHandledFlag in its Flags internal field.
+    auto& vm = *arg1;
+    auto flags = promise->internalField(JSC::JSPromise::Field::Flags).get().asUInt32(); // current flag bits
+    promise->internalField(JSC::JSPromise::Field::Flags).set(vm, promise, jsNumber(flags | JSC::JSPromise::isHandledFlag)); // write back with the handled bit OR-ed in; all other bits are kept
+}
 
 #pragma mark - JSC::JSInternalPromise
 
@@ -2666,6 +2670,12 @@ bool JSC__JSInternalPromise__isHandled(const JSC__JSInternalPromise* arg0, JSC__
 {
     return arg0->isHandled(reinterpret_cast<JSC::VM&>(arg1));
 }
+void JSC__JSInternalPromise__setHandled(JSC__JSInternalPromise* promise, JSC__VM* arg1)
+{ // Internal-promise counterpart of JSC__JSPromise__setHandled: sets isHandledFlag in the Flags internal field.
+    auto& vm = *arg1;
+    auto flags = promise->internalField(JSC::JSPromise::Field::Flags).get().asUInt32(); // current flag bits
+    promise->internalField(JSC::JSPromise::Field::Flags).set(vm, promise, jsNumber(flags | JSC::JSPromise::isHandledFlag)); // write back with the handled bit OR-ed in; all other bits are kept
+}
 
 #pragma mark - JSC::JSGlobalObject
 
@@ -2765,8 +2775,18 @@ void JSC__JSValue__put(JSC__JSValue JSValue0, JSC__JSGlobalObject* arg1, const Z
 
 bool JSC__JSValue__isClass(JSC__JSValue JSValue0, JSC__JSGlobalObject* arg1)
 {
-    JSC::JSValue value = JSC::JSValue::decode(JSValue0);
-    return value.isConstructor();
+    JSValue value = JSValue::decode(JSValue0);
+    auto callData = getCallData(value); // decide from the value's call data instead of isConstructor() alone
+
+    switch (callData.type) {
+    case CallData::Type::JS: // JS function: a class only when its executable is a class constructor
+        return callData.js.functionExecutable->isClassConstructorFunction();
+    case CallData::Type::Native: // native function: bound functions are rejected, others fall back to isConstructor()
+        if (callData.native.isBoundFunction)
+            return false;
+        return value.isConstructor();
+    }
+    return false; // any other call-data type is not treated as a class
 }
 bool JSC__JSValue__isCell(JSC__JSValue JSValue0) { return JSC::JSValue::decode(JSValue0).isCell(); }
 bool JSC__JSValue__isCustomGetterSetter(JSC__JSValue JSValue0)
@@ -3291,7 +3311,8 @@ bool JSC__JSValue__stringIncludes(JSC__JSValue value, JSC__JSGlobalObject* globa
 
 static void populateStackFrameMetadata(JSC::VM& vm, const JSC::StackFrame* stackFrame, ZigStackFrame* frame)
 {
-    frame->source_url = Zig::toZigString(stackFrame->sourceURL(vm));
+
+    frame->source_url = Bun::toStringRef(stackFrame->sourceURL(vm));
 
     if (stackFrame->isWasmFrame()) {
         frame->code_type = ZigStackFrameCodeWasm;
@@ -3328,37 +3349,11 @@ static void populateStackFrameMetadata(JSC::VM& vm, const JSC::StackFrame* stack
 
     JSC::JSObject* callee = JSC::jsCast<JSC::JSObject*>(calleeCell);
 
-    // Does the code block have a user-defined name property?
-    JSC::JSValue name = callee->getDirect(vm, vm.propertyNames->name);
-    if (name && name.isString()) {
-        auto str = name.toWTFString(m_codeBlock->globalObject());
-        frame->function_name = Zig::toZigString(str);
-        return;
-    }
-
-    /* For functions (either JSFunction or InternalFunction), fallback to their "native" name
-     * property. Based on JSC::getCalculatedDisplayName, "inlining" the
-     * JSFunction::calculatedDisplayName\InternalFunction::calculatedDisplayName calls */
-    if (JSC::JSFunction* function = JSC::jsDynamicCast<JSC::JSFunction*>(callee)) {
-
-        WTF::String actualName = function->name(vm);
-        if (!actualName.isEmpty() || function->isHostOrBuiltinFunction()) {
-            frame->function_name = Zig::toZigString(actualName);
-            return;
-        }
-
-        auto inferred_name = function->jsExecutable()->name();
-        frame->function_name = Zig::toZigString(inferred_name.string());
-    }
-
-    if (JSC::InternalFunction* function = JSC::jsDynamicCast<JSC::InternalFunction*>(callee)) {
-        // Based on JSC::InternalFunction::calculatedDisplayName, skipping the "displayName" property
-        frame->function_name = Zig::toZigString(function->name());
-    }
+    frame->function_name = Bun::toStringRef(JSC::getCalculatedDisplayName(vm, callee));
 }
 // Based on
 // https://github.com/mceSystems/node-jsc/blob/master/deps/jscshim/src/shim/JSCStackTrace.cpp#L298
-static void populateStackFramePosition(const JSC::StackFrame* stackFrame, ZigString* source_lines,
+static void populateStackFramePosition(const JSC::StackFrame* stackFrame, BunString* source_lines,
     int32_t* source_line_numbers, uint8_t source_lines_count,
     ZigStackFramePosition* position)
 {
@@ -3428,7 +3423,7 @@ static void populateStackFramePosition(const JSC::StackFrame* stackFrame, ZigStr
 
         // Most of the time, when you look at a stack trace, you want a couple lines above
 
-        source_lines[0] = { &chars[lineStart], lineStop - lineStart };
+        source_lines[0] = Bun::toStringRef(sourceString.substring(lineStart, lineStop - lineStart).toStringWithoutCopying());
         source_line_numbers[0] = line;
 
         if (lineStart > 0) {
@@ -3445,8 +3440,7 @@ static void populateStackFramePosition(const JSC::StackFrame* stackFrame, ZigStr
                 }
 
                 // We are at the beginning of the line
-                source_lines[source_line_i] = { &chars[byte_offset_in_source_string],
-                    end_of_line_offset - byte_offset_in_source_string + 1 };
+                source_lines[source_line_i] = Bun::toStringRef(sourceString.substring(byte_offset_in_source_string, end_of_line_offset - byte_offset_in_source_string + 1).toStringWithoutCopying());
 
                 source_line_numbers[source_line_i] = line - source_line_i;
                 source_line_i++;
@@ -3516,12 +3510,13 @@ static void fromErrorInstance(ZigException* except, JSC::JSGlobalObject* global,
     JSC::JSValue val)
 {
     JSC::JSObject* obj = JSC::jsDynamicCast<JSC::JSObject*>(val);
+    JSC::VM& vm = global->vm();
 
     bool getFromSourceURL = false;
     if (stackTrace != nullptr && stackTrace->size() > 0) {
-        populateStackTrace(global->vm(), *stackTrace, &except->stack);
+        populateStackTrace(vm, *stackTrace, &except->stack);
     } else if (err->stackTrace() != nullptr && err->stackTrace()->size() > 0) {
-        populateStackTrace(global->vm(), *err->stackTrace(), &except->stack);
+        populateStackTrace(vm, *err->stackTrace(), &except->stack);
     } else {
         getFromSourceURL = true;
     }
@@ -3533,33 +3528,35 @@ static void fromErrorInstance(ZigException* except, JSC::JSGlobalObject* global,
         except->code = 8;
     }
     if (except->code == SYNTAX_ERROR_CODE) {
-        except->message = Zig::toZigString(err->sanitizedMessageString(global));
-    } else if (JSC::JSValue message = obj->getIfPropertyExists(global, global->vm().propertyNames->message)) {
+        except->message = Bun::toStringRef(err->sanitizedMessageString(global));
+    } else if (JSC::JSValue message = obj->getIfPropertyExists(global, vm.propertyNames->message)) {
 
-        except->message = Zig::toZigString(message, global);
+        except->message = Bun::toStringRef(global, message);
 
     } else {
-        except->message = Zig::toZigString(err->sanitizedMessageString(global));
+        except->message = Bun::toStringRef(err->sanitizedMessageString(global));
     }
-    except->name = Zig::toZigString(err->sanitizedNameString(global));
+
+    except->name = Bun::toStringRef(err->sanitizedNameString(global));
+
     except->runtime_type = err->runtimeTypeForCause();
 
-    auto clientData = WebCore::clientData(global->vm());
+    auto clientData = WebCore::clientData(vm);
     if (except->code != SYNTAX_ERROR_CODE) {
 
         if (JSC::JSValue syscall = obj->getIfPropertyExists(global, clientData->builtinNames().syscallPublicName())) {
-            except->syscall = Zig::toZigString(syscall, global);
+            except->syscall = Bun::toStringRef(global, syscall);
         }
 
         if (JSC::JSValue code = obj->getIfPropertyExists(global, clientData->builtinNames().codePublicName())) {
-            except->code_ = Zig::toZigString(code, global);
+            except->code_ = Bun::toStringRef(global, code);
         }
 
         if (JSC::JSValue path = obj->getIfPropertyExists(global, clientData->builtinNames().pathPublicName())) {
-            except->path = Zig::toZigString(path, global);
+            except->path = Bun::toStringRef(global, path);
         }
 
-        if (JSC::JSValue fd = obj->getIfPropertyExists(global, Identifier::fromString(global->vm(), "fd"_s))) {
+        if (JSC::JSValue fd = obj->getIfPropertyExists(global, Identifier::fromString(vm, "fd"_s))) {
             if (fd.isAnyInt()) {
                 except->fd = fd.toInt32(global);
             }
@@ -3571,27 +3568,29 @@ static void fromErrorInstance(ZigException* except, JSC::JSGlobalObject* global,
     }
 
     if (getFromSourceURL) {
-        if (JSC::JSValue sourceURL = obj->getIfPropertyExists(global, global->vm().propertyNames->sourceURL)) {
-            except->stack.frames_ptr[0].source_url = Zig::toZigString(sourceURL, global);
+        if (JSC::JSValue sourceURL = obj->getIfPropertyExists(global, vm.propertyNames->sourceURL)) {
+            except->stack.frames_ptr[0].source_url = Bun::toStringRef(global, sourceURL);
 
-            if (JSC::JSValue column = obj->getIfPropertyExists(global, global->vm().propertyNames->column)) {
+            if (JSC::JSValue column = obj->getIfPropertyExists(global, vm.propertyNames->column)) {
                 except->stack.frames_ptr[0].position.column_start = column.toInt32(global);
             }
 
-            if (JSC::JSValue line = obj->getIfPropertyExists(global, global->vm().propertyNames->line)) {
+            if (JSC::JSValue line = obj->getIfPropertyExists(global, vm.propertyNames->line)) {
                 except->stack.frames_ptr[0].position.line = line.toInt32(global);
 
-                if (JSC::JSValue lineText = obj->getIfPropertyExists(global, JSC::Identifier::fromString(global->vm(), "lineText"_s))) {
+                if (JSC::JSValue lineText = obj->getIfPropertyExists(global, JSC::Identifier::fromString(vm, "lineText"_s))) {
                     if (JSC::JSString* jsStr = lineText.toStringOrNull(global)) {
                         auto str = jsStr->value(global);
-                        except->stack.source_lines_ptr[0] = Zig::toZigString(str);
+                        except->stack.source_lines_ptr[0] = Bun::toStringRef(str);
                         except->stack.source_lines_numbers[0] = except->stack.frames_ptr[0].position.line;
                         except->stack.source_lines_len = 1;
                         except->remapped = true;
                     }
                 }
             }
+
             except->stack.frames_len = 1;
+            except->stack.frames_ptr[0].remapped = obj->hasProperty(global, JSC::Identifier::fromString(vm, "originalLine"_s));
         }
     }
 
@@ -3605,7 +3604,7 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
     if (JSC::JSObject* obj = JSC::jsDynamicCast<JSC::JSObject*>(value)) {
         if (obj->hasProperty(global, global->vm().propertyNames->name)) {
             auto name_str = obj->getIfPropertyExists(global, global->vm().propertyNames->name).toWTFString(global);
-            except->name = Zig::toZigString(name_str);
+            except->name = Bun::toStringRef(name_str);
             if (name_str == "Error"_s) {
                 except->code = JSErrorCodeError;
             } else if (name_str == "EvalError"_s) {
@@ -3627,14 +3626,14 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
 
         if (JSC::JSValue message = obj->getIfPropertyExists(global, global->vm().propertyNames->message)) {
             if (message) {
-                except->message = Zig::toZigString(
+                except->message = Bun::toStringRef(
                     message.toWTFString(global));
             }
         }
 
         if (JSC::JSValue sourceURL = obj->getIfPropertyExists(global, global->vm().propertyNames->sourceURL)) {
             if (sourceURL) {
-                except->stack.frames_ptr[0].source_url = Zig::toZigString(
+                except->stack.frames_ptr[0].source_url = Bun::toStringRef(
                     sourceURL.toWTFString(global));
                 except->stack.frames_len = 1;
             }
@@ -3642,7 +3641,12 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
 
         if (JSC::JSValue line = obj->getIfPropertyExists(global, global->vm().propertyNames->line)) {
             if (line) {
-                except->stack.frames_ptr[0].position.line = line.toInt32(global);
+                // TODO: don't sourcemap it twice
+                if (auto originalLine = obj->getIfPropertyExists(global, JSC::Identifier::fromString(global->vm(), "originalLine"_s))) {
+                    except->stack.frames_ptr[0].position.line = originalLine.toInt32(global);
+                } else {
+                    except->stack.frames_ptr[0].position.line = line.toInt32(global);
+                }
                 except->stack.frames_len = 1;
             }
         }
@@ -3658,9 +3662,7 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
     }
     scope.release();
 
-    auto ref = OpaqueJSString::tryCreate(str);
-    except->message = ZigString { ref->characters8(), ref->length() };
-    ref->ref();
+    except->message = Bun::toStringRef(str);
 }
 
 void JSC__VM__releaseWeakRefs(JSC__VM* arg0)
@@ -3770,8 +3772,8 @@ void JSC__JSValue__toZigException(JSC__JSValue JSValue0, JSC__JSGlobalObject* ar
     JSC::JSValue value = JSC::JSValue::decode(JSValue0);
     if (value == JSC::JSValue {}) {
         exception->code = JSErrorCodeError;
-        exception->name = Zig::toZigString("Error"_s);
-        exception->message = Zig::toZigString("Unknown error"_s);
+        exception->name = Bun::toStringRef("Error"_s);
+        exception->message = Bun::toStringRef("Unknown error"_s);
         return;
     }
 
@@ -3900,36 +3902,6 @@ void JSC__VM__throwError(JSC__VM* vm_, JSC__JSGlobalObject* arg1, JSC__JSValue v
     scope.throwException(arg1, exception);
 }
 
-#pragma mark - JSC::ThrowScope
-
-void JSC__ThrowScope__clearException(JSC__ThrowScope* arg0)
-{
-    arg0->clearException();
-};
-bJSC__ThrowScope JSC__ThrowScope__declare(JSC__VM* arg0, unsigned char* arg1, unsigned char* arg2,
-    size_t arg3)
-{
-    Wrap<JSC::ThrowScope, bJSC__ThrowScope> wrapped = Wrap<JSC::ThrowScope, bJSC__ThrowScope>();
-    wrapped.cpp = new (wrapped.alignedBuffer()) JSC::ThrowScope(reinterpret_cast<JSC::VM&>(arg0));
-    return wrapped.result;
-};
-JSC__Exception* JSC__ThrowScope__exception(JSC__ThrowScope* arg0) { return arg0->exception(); }
-void JSC__ThrowScope__release(JSC__ThrowScope* arg0) { arg0->release(); }
-
-#pragma mark - JSC::CatchScope
-
-void JSC__CatchScope__clearException(JSC__CatchScope* arg0)
-{
-    arg0->clearException();
-}
-bJSC__CatchScope JSC__CatchScope__declare(JSC__VM* arg0, unsigned char* arg1, unsigned char* arg2,
-    size_t arg3)
-{
-    JSC::CatchScope scope = JSC::CatchScope(reinterpret_cast<JSC::VM&>(arg0));
-    return cast<bJSC__CatchScope>(&scope);
-}
-JSC__Exception* JSC__CatchScope__exception(JSC__CatchScope* arg0) { return arg0->exception(); }
-
 JSC__JSValue JSC__JSPromise__rejectedPromiseValue(JSC__JSGlobalObject* arg0,
     JSC__JSValue JSValue1)
 {
@@ -4091,9 +4063,18 @@ restart:
             if (key.len == 0)
                 return true;
 
-            JSC::JSValue propertyValue = objectToUse == object ? objectToUse->getDirect(entry.offset()) : JSValue();
+            JSC::JSValue propertyValue = JSValue();
+
+            if (objectToUse == object) {
+                propertyValue = objectToUse->getDirect(entry.offset());
+                if (!propertyValue) {
+                    scope.clearException();
+                    return true;
+                }
+            }
+
             if (!propertyValue || propertyValue.isGetterSetter() && !((entry.attributes() & PropertyAttribute::Accessor) != 0)) {
-                propertyValue = objectToUse->get(globalObject, prop);
+                propertyValue = objectToUse->getIfPropertyExists(globalObject, prop);
             }
 
             if (scope.exception())
diff --git a/src/bun.js/bindings/bindings.zig b/src/bun.js/bindings/bindings.zig
index 35c9d26fa..7e3fa6d8e 100644
--- a/src/bun.js/bindings/bindings.zig
+++ b/src/bun.js/bindings/bindings.zig
@@ -291,7 +291,27 @@ pub const ZigString = extern struct {
         return this.len * 2;
     }
 
-    /// Count the number of code points in the string.
+    pub fn utf16ByteLength(this: ZigString) usize { // Number of bytes this string occupies when encoded as UTF-16.
+        if (this.isUTF8()) {
+            return bun.simdutf.length.utf16.from.utf8.le(this.slice()) * 2; // simdutf reports UTF-16 code units, so scale to bytes like the branches below
+        }
+
+        if (this.is16Bit()) {
+            return this.len * 2; // 16-bit string: two bytes per `len` element
+        }
+
+        return JSC.WebCore.Encoder.byteLengthU8(this.slice().ptr, this.slice().len, .utf16le); // remaining 8-bit, non-UTF-8 case is delegated to the encoder
+    }
+
+    pub fn latin1ByteLength(this: ZigString) usize { // Byte length of the string as Latin-1; only implemented for non-UTF-8 strings.
+        if (this.isUTF8()) {
+            @panic("TODO"); // UTF-8 input is not supported yet and aborts here
+        }
+
+        return this.len; // NOTE(review): also returned for 16-bit strings, presumably one byte per code unit — confirm
+    }
+
+    /// Count the number of bytes in the UTF-8 version of the string.
     /// This function is slow. Use maxUITF8ByteLength() to get a quick estimate
     pub fn utf8ByteLength(this: ZigString) usize {
         if (this.isUTF8()) {
@@ -370,11 +390,11 @@ pub const ZigString = extern struct {
     }
 
     pub fn markStatic(this: *ZigString) void {
-        this.ptr = @intToPtr([*]const u8, @ptrToInt(this.ptr) | (1 << 60));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 60)); // sets tag bit 60 (static) on the same tagged-pointer field that markUTF8/markUTF16/markGlobal use
     }
 
     pub fn isStatic(this: *const ZigString) bool {
-        return @ptrToInt(this.ptr) & (1 << 60) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 60)) != 0; // tests tag bit 60 on the tagged-pointer field, matching isUTF8/is16Bit/isGloballyAllocated
     }
 
     pub const Slice = struct {
@@ -483,7 +503,7 @@ pub const ZigString = extern struct {
         }
 
         pub fn mut(this: Slice) []u8 {
-            return @intToPtr([*]u8, @ptrToInt(this.ptr))[0..this.len];
+            return @ptrFromInt([*]u8, @intFromPtr(this.ptr))[0..this.len]; // round-trips the pointer through an integer to get a mutable view over the same `len` bytes
         }
 
         /// Does nothing if the slice is not allocated
@@ -504,7 +524,7 @@ pub const ZigString = extern struct {
     pub const namespace = "";
 
     pub inline fn is16Bit(this: *const ZigString) bool {
-        return (@ptrToInt(this._unsafe_ptr_do_not_use) & (1 << 63)) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 63)) != 0; // bit 63 of the tagged pointer flags a 16-bit string (set by markUTF16)
     }
 
     pub inline fn utf16Slice(this: *const ZigString) []align(1) const u16 {
@@ -539,7 +559,7 @@ pub const ZigString = extern struct {
     }
 
     pub fn sortDesc(slice_: []ZigString) void {
-        std.sort.sort(ZigString, slice_, {}, cmpDesc);
+        std.sort.block(ZigString, slice_, {}, cmpDesc); // sorts `slice_` in place using cmpDesc; std.sort.block replaces the former std.sort.sort call
     }
 
     pub fn cmpDesc(_: void, a: ZigString, b: ZigString) bool {
@@ -547,7 +567,7 @@ pub const ZigString = extern struct {
     }
 
     pub fn sortAsc(slice_: []ZigString) void {
-        std.sort.sort(ZigString, slice_, {}, cmpAsc);
+        std.sort.block(ZigString, slice_, {}, cmpAsc); // sorts `slice_` in place using cmpAsc; std.sort.block replaces the former std.sort.sort call
     }
 
     pub fn cmpAsc(_: void, a: ZigString, b: ZigString) bool {
@@ -641,15 +661,15 @@ pub const ZigString = extern struct {
     }
 
     pub fn isUTF8(this: ZigString) bool {
-        return (@ptrToInt(this._unsafe_ptr_do_not_use) & (1 << 61)) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 61)) != 0; // bit 61 of the tagged pointer flags UTF-8 content (set by markUTF8)
     }
 
     pub fn markUTF8(this: *ZigString) void {
-        this._unsafe_ptr_do_not_use = @intToPtr([*]const u8, @ptrToInt(this._unsafe_ptr_do_not_use) | (1 << 61));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 61)); // sets tag bit 61, the bit isUTF8() tests
     }
 
     pub fn markUTF16(this: *ZigString) void {
-        this._unsafe_ptr_do_not_use = @intToPtr([*]const u8, @ptrToInt(this._unsafe_ptr_do_not_use) | (1 << 63));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 63)); // sets tag bit 63, the bit is16Bit() tests
     }
 
     pub fn setOutputEncoding(this: *ZigString) void {
@@ -658,7 +678,7 @@ pub const ZigString = extern struct {
     }
 
     pub inline fn isGloballyAllocated(this: ZigString) bool {
-        return (@ptrToInt(this._unsafe_ptr_do_not_use) & (1 << 62)) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 62)) != 0; // bit 62 of the tagged pointer is the flag set by markGlobal()
     }
 
     pub inline fn deinitGlobal(this: ZigString) void {
@@ -668,7 +688,7 @@ pub const ZigString = extern struct {
     pub const mark = markGlobal;
 
     pub inline fn markGlobal(this: *ZigString) void {
-        this._unsafe_ptr_do_not_use = @intToPtr([*]const u8, @ptrToInt(this._unsafe_ptr_do_not_use) | (1 << 62));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 62)); // sets tag bit 62, the bit isGloballyAllocated() tests
     }
 
     pub fn format(self: ZigString, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
@@ -694,7 +714,7 @@ pub const ZigString = extern struct {
     inline fn untagged(ptr: [*]const u8) [*]const u8 {
         // this can be null ptr, so long as it's also a 0 length string
         @setRuntimeSafety(false);
-        return @intToPtr([*]const u8, @truncate(u53, @ptrToInt(ptr)));
+        return @ptrFromInt([*]const u8, @truncate(u53, @intFromPtr(ptr))); // keeps only the low 53 bits, discarding the tag bits stored above them
     }
 
     pub fn slice(this: *const ZigString) []const u8 {
@@ -1303,7 +1323,7 @@ pub const FetchHeaders = opaque {
         this: *FetchHeaders,
         name_: HTTPHeaderName,
     ) bool {
-        return fastHas_(this, @enumToInt(name_));
+        return fastHas_(this, @intFromEnum(name_));
     }
 
     pub fn fastGet(
@@ -1311,7 +1331,7 @@ pub const FetchHeaders = opaque {
         name_: HTTPHeaderName,
     ) ?ZigString {
         var str = ZigString.init("");
-        fastGet_(this, @enumToInt(name_), &str);
+        fastGet_(this, @intFromEnum(name_), &str);
         if (str.len == 0) {
             return null;
         }
@@ -1441,7 +1461,7 @@ pub const FetchHeaders = opaque {
         this: *FetchHeaders,
         header: HTTPHeaderName,
     ) void {
-        return fastRemove_(this, @enumToInt(header));
+        return fastRemove_(this, @intFromEnum(header));
     }
 
     pub fn fastRemove_(
@@ -1561,10 +1581,10 @@ pub const FetchHeaders = opaque {
 pub const SystemError = extern struct {
     errno: c_int = 0,
     /// label for errno
-    code: ZigString = ZigString.init(""),
-    message: ZigString = ZigString.init(""),
-    path: ZigString = ZigString.init(""),
-    syscall: ZigString = ZigString.init(""),
+    code: String = String.empty,
+    message: String = String.empty,
+    path: String = String.empty,
+    syscall: String = String.empty,
     fd: i32 = -1,
 
     pub fn Maybe(comptime Result: type) type {
@@ -1611,11 +1631,11 @@ pub const Sizes = @import("../bindings/sizes.zig");
 pub const JSUint8Array = opaque {
     pub const name = "Uint8Array_alias";
     pub fn ptr(this: *JSUint8Array) [*]u8 {
-        return @intToPtr(*[*]u8, @ptrToInt(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayVector).*;
+        return @ptrFromInt(*[*]u8, @intFromPtr(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayVector).*; // loads the data pointer stored at a fixed byte offset from the object's address
     }
 
     pub fn len(this: *JSUint8Array) usize {
-        return @intToPtr(*usize, @ptrToInt(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayLength).*;
+        return @ptrFromInt(*usize, @intFromPtr(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayLength).*; // loads the length stored at a fixed byte offset from the object's address
     }
 
     pub fn slice(this: *JSUint8Array) []u8 {
@@ -2045,6 +2065,9 @@ pub const JSPromise = extern struct {
     pub fn isHandled(this: *const JSPromise, vm: *VM) bool {
         return cppFn("isHandled", .{ this, vm });
     }
+    pub fn setHandled(this: *JSPromise, vm: *VM) void { // Marks this promise as handled by calling the "setHandled" binding through cppFn.
+        cppFn("setHandled", .{ this, vm });
+    }
 
     pub fn rejectWithCaughtException(this: *JSPromise, globalObject: *JSGlobalObject, scope: ThrowScope) void {
         return cppFn("rejectWithCaughtException", .{ this, globalObject, scope });
@@ -2115,6 +2138,7 @@ pub const JSPromise = extern struct {
         "asValue",
         "create",
         "isHandled",
+        "setHandled",
         "reject",
         "rejectAsHandled",
         "rejectAsHandledException",
@@ -2149,6 +2173,9 @@ pub const JSInternalPromise = extern struct {
     pub fn isHandled(this: *const JSInternalPromise, vm: *VM) bool {
         return cppFn("isHandled", .{ this, vm });
     }
+    pub fn setHandled(this: *JSInternalPromise, vm: *VM) void { // Marks this internal promise as handled by calling the "setHandled" binding through cppFn.
+        cppFn("setHandled", .{ this, vm });
+    }
 
     pub fn rejectWithCaughtException(this: *JSInternalPromise, globalObject: *JSGlobalObject, scope: ThrowScope) void {
         return cppFn("rejectWithCaughtException", .{ this, globalObject, scope });
@@ -2332,6 +2359,7 @@ pub const JSInternalPromise = extern struct {
         "status",
         "result",
         "isHandled",
+        "setHandled",
         "resolvedPromise",
         "rejectedPromise",
         "resolve",
@@ -2363,6 +2391,11 @@ pub const AnyPromise = union(enum) {
             inline else => |promise| promise.isHandled(vm),
         };
     }
+    pub fn setHandled(this: AnyPromise, vm: *VM) void { // Marks whichever promise variant is active as handled.
+        switch (this) {
+            inline else => |promise| promise.setHandled(vm), // same call for every variant; each payload type provides setHandled
+        }
+    }
 
     pub fn rejectWithCaughtException(this: AnyPromise, globalObject: *JSGlobalObject, scope: ThrowScope) void {
         switch (this) {
@@ -2699,6 +2732,23 @@ pub const JSGlobalObject = extern struct {
             this.vm().throwError(this, this.createErrorInstance(Output.prettyFmt(fmt, false), args));
         }
     }
+    extern fn JSC__JSGlobalObject__queueMicrotaskCallback(*JSGlobalObject, *anyopaque, Function: *const (fn (*anyopaque) callconv(.C) void)) void;
+    pub fn queueMicrotaskCallback( // Hands `Function` and `ctx_val` to the native JSC__JSGlobalObject__queueMicrotaskCallback.
+        this: *JSGlobalObject,
+        ctx_val: anytype, // passed to the extern as *anyopaque, so it must coerce to a pointer
+        comptime Function: fn (ctx: @TypeOf(ctx_val)) void,
+    ) void {
+        JSC.markBinding(@src());
+        const Fn = Function;
+        const ContextType = @TypeOf(ctx_val);
+        const Wrapper = struct { // C-calling-convention trampoline: casts the opaque pointer back to ContextType, then calls Fn
+            pub fn call(p: *anyopaque) callconv(.C) void {
+                Fn(bun.cast(ContextType, p));
+            }
+        };
+
+        JSC__JSGlobalObject__queueMicrotaskCallback(this, ctx_val, &Wrapper.call);
+    }
 
     pub fn queueMicrotask(
         this: *JSGlobalObject,
@@ -3156,7 +3206,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
         pub fn isObject(this: JSType) bool {
             // inline constexpr bool isObjectType(JSType type) { return type >= ObjectType; }
-            return @enumToInt(this) >= @enumToInt(JSType.Object);
+            return @intFromEnum(this) >= @intFromEnum(JSType.Object);
         }
 
         pub fn isFunction(this: JSType) bool {
@@ -3311,7 +3361,7 @@ pub const JSValue = enum(JSValueReprInt) {
     };
 
     pub inline fn cast(ptr: anytype) JSValue {
-        return @intToEnum(JSValue, @bitCast(i64, @ptrToInt(ptr)));
+        return @enumFromInt(JSValue, @bitCast(i64, @intFromPtr(ptr)));
     }
 
     pub fn coerceToInt32(this: JSValue, globalThis: *JSC.JSGlobalObject) i32 {
@@ -3395,6 +3445,7 @@ pub const JSValue = enum(JSValueReprInt) {
             c_int => @intCast(c_int, toInt32(this)),
             ?AnyPromise => asAnyPromise(this),
             u52 => @truncate(u52, @intCast(u64, @max(this.toInt64(), 0))),
+            i52 => @truncate(i52, this.toInt64()), // truncate directly: @intCast to i52 is illegal behavior when the i64 does not fit
             u64 => toUInt64NoTruncate(this),
             u8 => @truncate(u8, toU32(this)),
             i16 => @truncate(i16, toInt32(this)),
@@ -3808,7 +3859,7 @@ pub const JSValue = enum(JSValueReprInt) {
             return jsNumberFromInt32(@intCast(i32, i));
         }
 
-        return jsNumberFromDouble(@intToFloat(f64, @truncate(i52, i)));
+        return jsNumberFromDouble(@floatFromInt(f64, @truncate(i52, i)));
     }
 
     pub inline fn toJS(this: JSValue, _: *const JSGlobalObject) JSValue {
@@ -3820,7 +3871,7 @@ pub const JSValue = enum(JSValueReprInt) {
             return jsNumberFromInt32(@intCast(i32, i));
         }
 
-        return jsNumberFromDouble(@intToFloat(f64, @intCast(i52, @truncate(u51, i))));
+        return jsNumberFromDouble(@floatFromInt(f64, @intCast(i52, @truncate(u51, i))));
     }
 
     pub fn coerceDoubleTruncatingIntoInt64(this: JSValue) i64 {
@@ -3834,7 +3885,7 @@ pub const JSValue = enum(JSValueReprInt) {
             return if (double_value < 0) @as(i64, std.math.minInt(i64)) else @as(i64, std.math.maxInt(i64));
         }
 
-        return @floatToInt(
+        return @intFromFloat(
             i64,
             double_value,
         );
@@ -3871,26 +3922,26 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn isUndefined(this: JSValue) bool {
-        return @enumToInt(this) == 0xa;
+        return @intFromEnum(this) == 0xa;
     }
     pub inline fn isNull(this: JSValue) bool {
-        return @enumToInt(this) == 0x2;
+        return @intFromEnum(this) == 0x2;
     }
     pub inline fn isEmptyOrUndefinedOrNull(this: JSValue) bool {
-        return switch (@enumToInt(this)) {
+        return switch (@intFromEnum(this)) {
             0, 0xa, 0x2 => true,
             else => false,
         };
     }
     pub fn isUndefinedOrNull(this: JSValue) bool {
-        return switch (@enumToInt(this)) {
+        return switch (@intFromEnum(this)) {
             0xa, 0x2 => true,
             else => false,
         };
     }
     /// Empty as in "JSValue {}" rather than an empty string
     pub inline fn isEmpty(this: JSValue) bool {
-        return switch (@enumToInt(this)) {
+        return switch (@intFromEnum(this)) {
             0 => true,
             else => false,
         };
@@ -4016,7 +4067,7 @@ pub const JSValue = enum(JSValueReprInt) {
     pub inline fn isCell(this: JSValue) bool {
         return switch (this) {
             .zero, .undefined, .null, .true, .false => false,
-            else => (@bitCast(u64, @enumToInt(this)) & FFI.NotCellMask) == 0,
+            else => (@bitCast(u64, @intFromEnum(this)) & FFI.NotCellMask) == 0,
         };
     }
 
@@ -4179,7 +4230,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     // intended to be more lightweight than ZigString
     pub fn fastGet(this: JSValue, global: *JSGlobalObject, builtin_name: BuiltinName) ?JSValue {
-        const result = fastGet_(this, global, @enumToInt(builtin_name));
+        const result = fastGet_(this, global, @intFromEnum(builtin_name));
         if (result == .zero) {
             return null;
         }
@@ -4188,7 +4239,7 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub fn fastGetDirect(this: JSValue, global: *JSGlobalObject, builtin_name: BuiltinName) ?JSValue {
-        const result = fastGetDirect_(this, global, @enumToInt(builtin_name));
+        const result = fastGetDirect_(this, global, @intFromEnum(builtin_name));
         if (result == .zero) {
             return null;
         }
@@ -4243,7 +4294,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     pub fn get(this: JSValue, global: *JSGlobalObject, property: []const u8) ?JSValue {
         const value = getIfPropertyExistsImpl(this, global, property.ptr, @intCast(u32, property.len));
-        return if (@enumToInt(value) != 0) value else return null;
+        return if (@intFromEnum(value) != 0) value else return null;
     }
 
     pub fn implementsToString(this: JSValue, global: *JSGlobalObject) bool {
@@ -4407,7 +4458,7 @@ pub const JSValue = enum(JSValueReprInt) {
     /// This algorithm differs from the IsStrictlyEqual Algorithm by treating all NaN values as equivalent and by differentiating +0𝔽 from -0𝔽.
     /// https://tc39.es/ecma262/#sec-samevalue
     pub fn isSameValue(this: JSValue, other: JSValue, global: *JSGlobalObject) bool {
-        return @enumToInt(this) == @enumToInt(other) or cppFn("isSameValue", .{ this, other, global });
+        return @intFromEnum(this) == @intFromEnum(other) or cppFn("isSameValue", .{ this, other, global });
     }
 
     pub fn deepEquals(this: JSValue, other: JSValue, global: *JSGlobalObject) bool {
@@ -4460,7 +4511,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     /// Get the internal number of the `JSC::DateInstance` object
     /// Returns NaN if the value is not a `JSC::DateInstance` (`Date` in JS)
-     pub fn getUnixTimestamp(this: JSValue) f64 {
+    pub fn getUnixTimestamp(this: JSValue) f64 {
         return cppFn("getUnixTimestamp", .{
             this,
         });
@@ -4492,7 +4543,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     pub fn asNumber(this: JSValue) f64 {
         if (this.isInt32()) {
-            return @intToFloat(f64, this.asInt32());
+            return @floatFromInt(f64, this.asInt32());
         }
 
         if (isNumber(this)) {
@@ -4515,19 +4566,19 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub fn asPtr(this: JSValue, comptime Pointer: type) *Pointer {
-        return @intToPtr(*Pointer, this.asPtrAddress());
+        return @ptrFromInt(*Pointer, this.asPtrAddress());
     }
 
     pub fn fromPtrAddress(addr: anytype) JSValue {
-        return jsNumber(@intToFloat(f64, @bitCast(usize, @as(usize, addr))));
+        return jsNumber(@floatFromInt(f64, @bitCast(usize, @as(usize, addr))));
     }
 
     pub fn asPtrAddress(this: JSValue) usize {
-        return @bitCast(usize, @floatToInt(usize, this.asDouble()));
+        return @bitCast(usize, @intFromFloat(usize, this.asDouble()));
     }
 
     pub fn fromPtr(addr: anytype) JSValue {
-        return fromPtrAddress(@ptrToInt(addr));
+        return fromPtrAddress(@intFromPtr(addr));
     }
 
     pub fn toBooleanSlow(this: JSValue, global: *JSGlobalObject) bool {
@@ -4546,13 +4597,20 @@ pub const JSValue = enum(JSValueReprInt) {
         return FFI.JSVALUE_TO_BOOL(.{ .asJSValue = this });
     }
 
+    pub inline fn asInt52(this: JSValue) i64 {
+        if (comptime bun.Environment.allow_assert) { // the isNumber() check is only compiled in when allow_assert is set
+            std.debug.assert(this.isNumber());
+        }
+        return @intFromFloat(i64, @max(@min(this.asDouble(), std.math.maxInt(i52)), std.math.minInt(i52))); // clamp the double to the i52 range, then convert to i64
+    }
+
     pub fn toInt32(this: JSValue) i32 {
         if (this.isInt32()) {
             return asInt32(this);
         }
 
         if (this.isNumber()) {
-            return @truncate(i32, @floatToInt(i64, asDouble(this)));
+            return @truncate(i32, this.asInt52());
         }
 
         if (comptime bun.Environment.allow_assert) {
@@ -4570,11 +4628,11 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn toU16(this: JSValue) u16 {
-        return @truncate(u16, this.toU32());
+        return @truncate(u16, @max(this.toInt32(), 0));
     }
 
     pub inline fn toU32(this: JSValue) u32 {
-        return @intCast(u32, @max(this.toInt32(), 0));
+        return @intCast(u32, @min(@max(this.toInt64(), 0), std.math.maxInt(u32)));
     }
 
     /// This function supports:
@@ -4591,11 +4649,11 @@ pub const JSValue = enum(JSValueReprInt) {
     /// If the "length" property does not exist, this function will return 0.
     pub fn getLength(this: JSValue, globalThis: *JSGlobalObject) u64 {
         const len = this.getLengthIfPropertyExistsInternal(globalThis);
-        if (len == std.math.f64_max) {
+        if (len == std.math.floatMax(f64)) {
             return 0;
         }
 
-        return @floatToInt(u64, @max(len, 0));
+        return @intFromFloat(u64, @max(@min(len, std.math.maxInt(i52)), 0));
     }
 
     /// This function supports:
@@ -4612,11 +4670,11 @@ pub const JSValue = enum(JSValueReprInt) {
     /// If the "length" property does not exist, this function will return null.
     pub fn tryGetLength(this: JSValue, globalThis: *JSGlobalObject) ?f64 {
         const len = this.getLengthIfPropertyExistsInternal(globalThis);
-        if (len == std.math.f64_max) {
+        if (len == std.math.floatMax(f64)) {
             return null;
         }
 
-        return @floatToInt(u64, @max(len, 0));
+        return @floatFromInt(f64, @intFromFloat(u64, @max(@min(len, std.math.maxInt(i52)), 0))); // clamp to [0, maxInt(i52)], drop the fraction, then convert back because a u64 does not coerce to ?f64
     }
 
     /// Do not use this directly!
@@ -4661,15 +4719,15 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn asRef(this: JSValue) C_API.JSValueRef {
-        return @intToPtr(C_API.JSValueRef, @bitCast(usize, @enumToInt(this)));
+        return @ptrFromInt(C_API.JSValueRef, @bitCast(usize, @intFromEnum(this)));
     }
 
     pub inline fn c(this: C_API.JSValueRef) JSValue {
-        return @intToEnum(JSValue, @bitCast(JSValue.Type, @ptrToInt(this)));
+        return @enumFromInt(JSValue, @bitCast(JSValue.Type, @intFromPtr(this)));
     }
 
     pub inline fn fromRef(this: C_API.JSValueRef) JSValue {
-        return @intToEnum(JSValue, @bitCast(JSValue.Type, @ptrToInt(this)));
+        return @enumFromInt(JSValue, @bitCast(JSValue.Type, @intFromPtr(this)));
     }
 
     pub inline fn asObjectRef(this: JSValue) C_API.JSObjectRef {
@@ -4685,12 +4743,12 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn asNullableVoid(this: JSValue) ?*anyopaque {
-        return @intToPtr(?*anyopaque, @bitCast(usize, @enumToInt(this)));
+        return @ptrFromInt(?*anyopaque, @bitCast(usize, @intFromEnum(this)));
     }
 
     pub inline fn asVoid(this: JSValue) *anyopaque {
         if (comptime bun.Environment.allow_assert) {
-            if (@enumToInt(this) == 0) {
+            if (@intFromEnum(this) == 0) {
                 @panic("JSValue is null");
             }
         }
@@ -4857,7 +4915,7 @@ pub const Exception = extern struct {
     pub fn create(globalObject: *JSGlobalObject, object: *JSObject, stack_capture: StackCaptureAction) *Exception {
         return cppFn(
             "create",
-            .{ globalObject, object, @enumToInt(stack_capture) },
+            .{ globalObject, object, @intFromEnum(stack_capture) },
         );
     }
 
@@ -4893,7 +4951,7 @@ pub const VM = extern struct {
         LargeHeap = 1,
     };
     pub fn create(heap_type: HeapType) *VM {
-        return cppFn("create", .{@enumToInt(heap_type)});
+        return cppFn("create", .{@intFromEnum(heap_type)});
     }
 
     pub fn deinit(vm: *VM, global_object: *JSGlobalObject) void {
@@ -5157,16 +5215,16 @@ pub const CallFrame = opaque {
     pub fn arguments(self: *const CallFrame, comptime max: usize) Arguments(max) {
         const len = self.argumentsCount();
         var ptr = self.argumentsPtr();
-        return switch (@min(len, max)) {
+        return switch (@as(u4, @min(len, max))) {
             0 => .{ .ptr = undefined, .len = 0 },
-            1 => Arguments(max).init(1, ptr),
-            2 => Arguments(max).init(@min(2, max), ptr),
-            3 => Arguments(max).init(@min(3, max), ptr),
-            4 => Arguments(max).init(@min(4, max), ptr),
-            5 => Arguments(max).init(@min(5, max), ptr),
-            6 => Arguments(max).init(@min(6, max), ptr),
-            7 => Arguments(max).init(@min(7, max), ptr),
-            8 => Arguments(max).init(@min(8, max), ptr),
+            4 => Arguments(max).init(comptime @min(4, max), ptr),
+            2 => Arguments(max).init(comptime @min(2, max), ptr),
+            6 => Arguments(max).init(comptime @min(6, max), ptr),
+            3 => Arguments(max).init(comptime @min(3, max), ptr),
+            8 => Arguments(max).init(comptime @min(8, max), ptr),
+            5 => Arguments(max).init(comptime @min(5, max), ptr),
+            1 => Arguments(max).init(comptime @min(1, max), ptr),
+            7 => Arguments(max).init(comptime @min(7, max), ptr),
             else => unreachable,
         };
     }
@@ -5555,6 +5613,7 @@ pub const __DOMCall__reader_u64 = @import("../api/bun.zig").FFI.Reader.Class.fun
 pub const __DOMCall__reader_intptr = @import("../api/bun.zig").FFI.Reader.Class.functionDefinitions.intptr;
 pub const __Crypto_getRandomValues = @import("../webcore.zig").Crypto.Class.functionDefinitions.getRandomValues;
 pub const __Crypto_randomUUID = @import("../webcore.zig").Crypto.Class.functionDefinitions.randomUUID;
+pub const __Crypto_randomInt = @import("../webcore.zig").Crypto.Class.functionDefinitions.randomInt;
 pub const __Crypto_timingSafeEqual = @import("../webcore.zig").Crypto.Class.functionDefinitions.timingSafeEqual;
 pub const DOMCalls = .{
     @import("../api/bun.zig").FFI,
diff --git a/src/bun.js/bindings/exports.zig b/src/bun.js/bindings/exports.zig
index 6ea1eba60..e9e9d3a8d 100644
--- a/src/bun.js/bindings/exports.zig
+++ b/src/bun.js/bindings/exports.zig
@@ -29,6 +29,7 @@ const Backtrace = @import("../../crash_reporter.zig");
 const JSPrinter = bun.js_printer;
 const JSLexer = bun.js_lexer;
 const typeBaseName = @import("../../meta.zig").typeBaseName;
+const String = bun.String;
 
 pub const ZigGlobalObject = extern struct {
     pub const shim = Shimmer("Zig", "GlobalObject", @This());
@@ -112,11 +113,11 @@ pub const ErrorCode = enum(ErrorCodeInt) {
     _,
 
     pub inline fn from(code: anyerror) ErrorCode {
-        return @intToEnum(ErrorCode, @errorToInt(code));
+        return @enumFromInt(ErrorCode, @intFromError(code));
     }
 
-    pub const ParserError = @enumToInt(ErrorCode.from(error.ParserError));
-    pub const JSErrorObject = @enumToInt(ErrorCode.from(error.JSErrorObject));
+    pub const ParserError = @intFromEnum(ErrorCode.from(error.ParserError));
+    pub const JSErrorObject = @intFromEnum(ErrorCode.from(error.JSErrorObject));
 
     pub const Type = ErrorCodeInt;
 };
@@ -216,9 +217,10 @@ pub const ResolvedSource = extern struct {
 
     pub const Tag = enum(u64) {
         javascript = 0,
-        wasm = 1,
-        object = 2,
-        file = 3,
+        package_json_type_module = 1,
+        wasm = 2,
+        object = 3,
+        file = 4,
 
         @"node:buffer" = 1024,
         @"node:process" = 1025,
@@ -244,7 +246,7 @@ export fn ZigString__free(raw: [*]const u8, len: usize, allocator_: ?*anyopaque)
 }
 
 export fn ZigString__free_global(ptr: [*]const u8, len: usize) void {
-    var untagged = @intToPtr(*anyopaque, @ptrToInt(ZigString.init(ptr[0..len]).slice().ptr));
+    var untagged = @ptrFromInt(*anyopaque, @intFromPtr(ZigString.init(ptr[0..len]).slice().ptr));
     if (comptime Environment.allow_assert) {
         std.debug.assert(Mimalloc.mi_is_in_heap_region(ptr));
     }
@@ -437,7 +439,7 @@ pub const Process = extern struct {
 };
 
 pub const ZigStackTrace = extern struct {
-    source_lines_ptr: [*c]ZigString,
+    source_lines_ptr: [*c]bun.String,
     source_lines_numbers: [*c]i32,
     source_lines_len: u8,
     source_lines_to_collect: u8,
@@ -455,23 +457,24 @@ pub const ZigStackTrace = extern struct {
         {
             var source_lines_iter = this.sourceLineIterator();
 
-            var source_line_len: usize = 0;
-            var count: usize = 0;
-            while (source_lines_iter.next()) |source| {
-                count += 1;
-                source_line_len += source.text.len;
-            }
+            var source_line_len = source_lines_iter.getLength();
 
-            if (count > 0 and source_line_len > 0) {
-                var source_lines = try allocator.alloc(Api.SourceLine, count);
+            if (source_line_len > 0) {
+                var source_lines = try allocator.alloc(Api.SourceLine, @intCast(usize, @max(source_lines_iter.i + 1, 0)));
                 var source_line_buf = try allocator.alloc(u8, source_line_len);
                 source_lines_iter = this.sourceLineIterator();
                 var remain_buf = source_line_buf[0..];
                 var i: usize = 0;
                 while (source_lines_iter.next()) |source| {
-                    bun.copy(u8, remain_buf, source.text);
-                    const copied_line = remain_buf[0..source.text.len];
-                    remain_buf = remain_buf[source.text.len..];
+                    const text = source.text.slice();
+                    defer source.text.deinit();
+                    bun.copy(
+                        u8,
+                        remain_buf,
+                        text,
+                    );
+                    const copied_line = remain_buf[0..text.len];
+                    remain_buf = remain_buf[text.len..];
                     source_lines[i] = .{ .text = copied_line, .line = source.line };
                     i += 1;
                 }
@@ -507,9 +510,18 @@ pub const ZigStackTrace = extern struct {
 
         pub const SourceLine = struct {
             line: i32,
-            text: string,
+            text: ZigString.Slice,
         };
 
+        /// Returns the sum of `length()` over the source lines at indices 0 through `i`.
+        pub fn getLength(this: *SourceLineIterator) usize {
+            var count: usize = 0;
+            for (this.trace.source_lines_ptr[0..@intCast(usize, @max(this.i + 1, 0))]) |*line| { // clamp at zero so a negative `i + 1` never reaches @intCast
+                count += line.length(); // NOTE(review): callers size a buffer for `toUTF8` bytes from this total; confirm length() cannot be smaller than the UTF-8 byte count for non-ASCII text
+            }
+            return count;
+        }
+
         pub fn untilLast(this: *SourceLineIterator) ?SourceLine {
             if (this.i < 1) return null;
             return this.next();
@@ -521,7 +533,7 @@ pub const ZigStackTrace = extern struct {
             const source_line = this.trace.source_lines_ptr[@intCast(usize, this.i)];
             const result = SourceLine{
                 .line = this.trace.source_lines_numbers[@intCast(usize, this.i)],
-                .text = source_line.slice(),
+                .text = source_line.toUTF8(bun.default_allocator),
             };
             this.i -= 1;
             return result;
@@ -540,28 +552,35 @@ pub const ZigStackTrace = extern struct {
 };
 
 pub const ZigStackFrame = extern struct {
-    function_name: ZigString,
-    source_url: ZigString,
+    function_name: String,
+    source_url: String,
     position: ZigStackFramePosition,
     code_type: ZigStackFrameCode,
 
     /// This informs formatters whether to display as a blob URL or not
     remapped: bool = false,
 
+    pub fn deinit(this: *ZigStackFrame) void {
+        this.function_name.deref();
+        this.source_url.deref();
+    }
+
     pub fn toAPI(this: *const ZigStackFrame, root_path: string, origin: ?*const ZigURL, allocator: std.mem.Allocator) !Api.StackFrame {
         var frame: Api.StackFrame = comptime std.mem.zeroes(Api.StackFrame);
-        if (this.function_name.len > 0) {
-            frame.function_name = try allocator.dupe(u8, this.function_name.slice());
+        if (!this.function_name.isEmpty()) {
+            var slicer = this.function_name.toUTF8(allocator);
+            defer slicer.deinit();
+            frame.function_name = (try slicer.clone(allocator)).slice();
         }
 
-        if (this.source_url.len > 0) {
+        if (!this.source_url.isEmpty()) {
             frame.file = try std.fmt.allocPrint(allocator, "{any}", .{this.sourceURLFormatter(root_path, origin, true, false)});
         }
 
         frame.position.source_offset = this.position.source_offset;
 
         // For remapped code, we add 1 to the line number
-        frame.position.line = this.position.line + @as(i32, @boolToInt(this.remapped));
+        frame.position.line = this.position.line + @as(i32, @intFromBool(this.remapped));
 
         frame.position.line_start = this.position.line_start;
         frame.position.line_stop = this.position.line_stop;
@@ -569,13 +588,13 @@ pub const ZigStackFrame = extern struct {
         frame.position.column_stop = this.position.column_stop;
         frame.position.expression_start = this.position.expression_start;
         frame.position.expression_stop = this.position.expression_stop;
-        frame.scope = @intToEnum(Api.StackFrameScope, @enumToInt(this.code_type));
+        frame.scope = @enumFromInt(Api.StackFrameScope, @intFromEnum(this.code_type));
 
         return frame;
     }
 
     pub const SourceURLFormatter = struct {
-        source_url: ZigString,
+        source_url: bun.String,
         position: ZigStackFramePosition,
         enable_color: bool,
         origin: ?*const ZigURL,
@@ -587,7 +606,9 @@ pub const ZigStackFrame = extern struct {
                 try writer.writeAll(Output.prettyFmt("<r><cyan>", true));
             }
 
-            var source_slice = this.source_url.slice();
+            var source_slice_ = this.source_url.toUTF8(bun.default_allocator);
+            var source_slice = source_slice_.slice();
+            defer source_slice_.deinit();
 
             if (!this.remapped) {
                 if (this.origin) |origin| {
@@ -646,12 +667,12 @@ pub const ZigStackFrame = extern struct {
     };
 
     pub const NameFormatter = struct {
-        function_name: ZigString,
+        function_name: String,
         code_type: ZigStackFrameCode,
         enable_color: bool,
 
         pub fn format(this: NameFormatter, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
-            const name = this.function_name.slice();
+            const name = this.function_name;
 
             switch (this.code_type) {
                 .Eval => {
@@ -661,26 +682,26 @@ pub const ZigStackFrame = extern struct {
                     // try writer.writeAll("(esm)");
                 },
                 .Function => {
-                    if (name.len > 0) {
+                    if (!name.isEmpty()) {
                         if (this.enable_color) {
-                            try std.fmt.format(writer, comptime Output.prettyFmt("<r><b><i>{s}<r>", true), .{name});
+                            try std.fmt.format(writer, comptime Output.prettyFmt("<r><b><i>{}<r>", true), .{name});
                         } else {
-                            try std.fmt.format(writer, "{s}", .{name});
+                            try std.fmt.format(writer, "{}", .{name});
                         }
                     }
                 },
                 .Global => {
-                    if (name.len > 0) {
-                        try std.fmt.format(writer, "globalThis {s}", .{name});
+                    if (!name.isEmpty()) {
+                        try std.fmt.format(writer, "globalThis {}", .{name});
                     } else {
                         try writer.writeAll("globalThis");
                     }
                 },
                 .Wasm => {
-                    try std.fmt.format(writer, "WASM {s}", .{name});
+                    try std.fmt.format(writer, "WASM {}", .{name});
                 },
                 .Constructor => {
-                    try std.fmt.format(writer, "new {s}", .{name});
+                    try std.fmt.format(writer, "new {}", .{name});
                 },
                 else => {},
             }
@@ -688,9 +709,9 @@ pub const ZigStackFrame = extern struct {
     };
 
     pub const Zero: ZigStackFrame = ZigStackFrame{
-        .function_name = ZigString{ ._unsafe_ptr_do_not_use = "", .len = 0 },
+        .function_name = String.empty,
         .code_type = ZigStackFrameCode.None,
-        .source_url = ZigString{ ._unsafe_ptr_do_not_use = "", .len = 0 },
+        .source_url = String.empty,
         .position = ZigStackFramePosition.Invalid,
     };
 
@@ -743,14 +764,14 @@ pub const ZigException = extern struct {
     /// SystemError only
     errno: c_int = 0,
     /// SystemError only
-    syscall: ZigString = ZigString.Empty,
+    syscall: String = String.empty,
     /// SystemError only
-    system_code: ZigString = ZigString.Empty,
+    system_code: String = String.empty,
     /// SystemError only
-    path: ZigString = ZigString.Empty,
+    path: String = String.empty,
 
-    name: ZigString,
-    message: ZigString,
+    name: String,
+    message: String,
     stack: ZigStackTrace,
 
     exception: ?*anyopaque,
@@ -759,6 +780,19 @@ pub const ZigException = extern struct {
 
     fd: i32 = -1,
 
+    pub fn deinit(this: *ZigException) void {
+        this.syscall.deref(); // the three fields documented as "SystemError only"
+        this.system_code.deref();
+        this.path.deref();
+
+        this.name.deref();
+        this.message.deref();
+
+        for (this.stack.frames_ptr[0..this.stack.frames_len]) |*frame| { // walk the first `frames_len` entries behind `frames_ptr`
+            frame.deinit(); // ZigStackFrame.deinit derefs the frame's function_name and source_url
+        }
+    }
+
     pub const shim = Shimmer("Zig", "Exception", @This());
     pub const name = "ZigException";
     pub const namespace = shim.namespace;
@@ -767,7 +801,7 @@ pub const ZigException = extern struct {
         const frame_count = 32;
         pub const source_lines_count = 6;
         source_line_numbers: [source_lines_count]i32,
-        source_lines: [source_lines_count]ZigString,
+        source_lines: [source_lines_count]String,
         frames: [frame_count]ZigStackFrame,
         loaded: bool,
         zig_exception: ZigException,
@@ -775,18 +809,18 @@ pub const ZigException = extern struct {
         pub const Zero: Holder = Holder{
             .frames = brk: {
                 var _frames: [frame_count]ZigStackFrame = undefined;
-                std.mem.set(ZigStackFrame, &_frames, ZigStackFrame.Zero);
+                @memset(&_frames, ZigStackFrame.Zero);
                 break :brk _frames;
             },
             .source_line_numbers = brk: {
                 var lines: [source_lines_count]i32 = undefined;
-                std.mem.set(i32, &lines, -1);
+                @memset(&lines, -1);
                 break :brk lines;
             },
 
             .source_lines = brk: {
-                var lines: [source_lines_count]ZigString = undefined;
-                std.mem.set(ZigString, &lines, ZigString.Empty);
+                var lines: [source_lines_count]String = undefined;
+                @memset(&lines, String.empty);
                 break :brk lines;
             },
             .zig_exception = undefined,
@@ -797,13 +831,17 @@ pub const ZigException = extern struct {
             return Holder.Zero;
         }
 
+        pub fn deinit(this: *Holder) void {
+            this.zigException().deinit(); // zigException() first fills `zig_exception` when `loaded` is false, so this goes through the accessor rather than the field
+        }
+
         pub fn zigException(this: *Holder) *ZigException {
             if (!this.loaded) {
                 this.zig_exception = ZigException{
-                    .code = @intToEnum(JSErrorCode, 255),
+                    .code = @enumFromInt(JSErrorCode, 255),
                     .runtime_type = JSRuntimeType.Nothing,
-                    .name = ZigString.Empty,
-                    .message = ZigString.Empty,
+                    .name = String.empty,
+                    .message = String.empty,
                     .exception = null,
                     .stack = ZigStackTrace{
                         .source_lines_ptr = &this.source_lines,
@@ -831,13 +869,18 @@ pub const ZigException = extern struct {
         root_path: string,
         origin: ?*const ZigURL,
     ) !void {
-        const _name: string = @field(this, "name").slice();
-        const message: string = @field(this, "message").slice();
+        const name_slice = @field(this, "name").toUTF8(bun.default_allocator);
+        const message_slice = @field(this, "message").toUTF8(bun.default_allocator);
+
+        const _name = name_slice.slice();
+        defer name_slice.deinit();
+        const message = message_slice.slice();
+        defer message_slice.deinit();
 
         var is_empty = true;
         var api_exception = Api.JsException{
-            .runtime_type = @enumToInt(this.runtime_type),
-            .code = @enumToInt(this.code),
+            .runtime_type = @intFromEnum(this.runtime_type),
+            .code = @intFromEnum(this.code),
         };
 
         if (_name.len > 0) {
@@ -1314,7 +1357,7 @@ pub const ZigConsoleClient = struct {
             };
 
             pub fn getAdvanced(value: JSValue, globalThis: *JSGlobalObject, opts: Options) Result {
-                switch (@enumToInt(value)) {
+                switch (@intFromEnum(value)) {
                     0, 0xa => return Result{
                         .tag = .Undefined,
                     },
@@ -1374,23 +1417,20 @@ pub const ZigConsoleClient = struct {
 
                 // If we check an Object has a method table and it does not
                 // it will crash
-                const callable = js_type != .Object and value.isCallable(globalThis.vm());
-
-                if (value.isClass(globalThis) and !callable) {
-                    return .{
-                        .tag = .Object,
-                        .cell = js_type,
-                    };
-                }
+                if (js_type != .Object and value.isCallable(globalThis.vm())) {
+                    if (value.isClass(globalThis)) {
+                        return .{
+                            .tag = .Class,
+                            .cell = js_type,
+                        };
+                    }
 
-                if (callable and js_type == .JSFunction) {
-                    return .{
-                        .tag = .Function,
-                        .cell = js_type,
-                    };
-                } else if (callable and js_type == .InternalFunction) {
                     return .{
-                        .tag = .Object,
+                        // TODO: we print InternalFunction as Object because we have a lot of
+                        // callable namespaces and printing the contents of it is better than [Function: namespace]
+                        // ideally, we would print [Function: namespace] { ... } on all functions, internal and js.
+                        // what we'll do later is rid of .Function and .Class and handle the prefix in the .Object formatter
+                        .tag = if (js_type == .InternalFunction) .Object else .Function,
                         .cell = js_type,
                     };
                 }
@@ -1713,7 +1753,7 @@ pub const ZigConsoleClient = struct {
                 parent: JSValue,
                 const enable_ansi_colors = enable_ansi_colors_;
                 pub fn handleFirstProperty(this: *@This(), globalThis: *JSC.JSGlobalObject, value: JSValue) void {
-                    if (!value.jsType().isFunction() and !value.isClass(globalThis)) {
+                    if (!value.jsType().isFunction()) {
                         var writer = WrappedWriter(Writer){
                             .ctx = this.writer,
                             .failed = false,
@@ -1878,7 +1918,7 @@ pub const ZigConsoleClient = struct {
                     this.map = this.map_node.?.data;
                 }
 
-                var entry = this.map.getOrPut(@enumToInt(value)) catch unreachable;
+                var entry = this.map.getOrPut(@intFromEnum(value)) catch unreachable;
                 if (entry.found_existing) {
                     writer.writeAll(comptime Output.prettyFmt("<r><cyan>[Circular]<r>", enable_ansi_colors));
                     return;
@@ -1887,7 +1927,7 @@ pub const ZigConsoleClient = struct {
 
             defer {
                 if (comptime Format.canHaveCircularReferences()) {
-                    _ = this.map.remove(@enumToInt(value));
+                    _ = this.map.remove(@intFromEnum(value));
                 }
             }
 
@@ -1959,7 +1999,7 @@ pub const ZigConsoleClient = struct {
                             i = -i;
                         }
                         const digits = if (i != 0)
-                            bun.fmt.fastDigitCount(@intCast(usize, i)) + @as(usize, @boolToInt(is_negative))
+                            bun.fmt.fastDigitCount(@intCast(usize, i)) + @as(usize, @intFromBool(is_negative))
                         else
                             1;
                         this.addForNewLine(digits);
@@ -2051,9 +2091,9 @@ pub const ZigConsoleClient = struct {
                     this.addForNewLine(printable.len);
 
                     if (printable.len == 0) {
-                        writer.print(comptime Output.prettyFmt("[class]", enable_ansi_colors), .{});
+                        writer.print(comptime Output.prettyFmt("<cyan>[class]<r>", enable_ansi_colors), .{});
                     } else {
-                        writer.print(comptime Output.prettyFmt("[class <cyan>{}<r>]", enable_ansi_colors), .{printable});
+                        writer.print(comptime Output.prettyFmt("<cyan>[class {}]<r>", enable_ansi_colors), .{printable});
                     }
                 },
                 .Function => {
@@ -2063,7 +2103,7 @@ pub const ZigConsoleClient = struct {
                     if (printable.len == 0) {
                         writer.print(comptime Output.prettyFmt("<cyan>[Function]<r>", enable_ansi_colors), .{});
                     } else {
-                        writer.print(comptime Output.prettyFmt("<cyan>[Function<d>:<r> <cyan>{}]<r>", enable_ansi_colors), .{printable});
+                        writer.print(comptime Output.prettyFmt("<cyan>[Function: {}]<r>", enable_ansi_colors), .{printable});
                     }
                 },
                 .Getter => {
@@ -2220,11 +2260,11 @@ pub const ZigConsoleClient = struct {
                     } else if (value.as(JSC.ResolveMessage)) |resolve_log| {
                         resolve_log.msg.writeFormat(writer_, enable_ansi_colors) catch {};
                         return;
-                    } else if (value.as(JSC.Jest.ExpectAnything) != null) {
+                    } else if (value.as(JSC.Expect.ExpectAnything) != null) {
                         writer.writeAll("Anything");
                         return;
-                    } else if (value.as(JSC.Jest.ExpectAny) != null) {
-                        const constructor_value = JSC.Jest.ExpectAny.constructorValueGetCached(value) orelse return;
+                    } else if (value.as(JSC.Expect.ExpectAny) != null) {
+                        const constructor_value = JSC.Expect.ExpectAny.constructorValueGetCached(value) orelse return;
 
                         this.addForNewLine("Any<".len);
                         writer.writeAll("Any<");
@@ -2237,16 +2277,16 @@ pub const ZigConsoleClient = struct {
                         writer.writeAll(">");
 
                         return;
-                    } else if (value.as(JSC.Jest.ExpectStringContaining) != null) {
-                        const substring_value = JSC.Jest.ExpectStringContaining.stringValueGetCached(value) orelse return;
+                    } else if (value.as(JSC.Expect.ExpectStringContaining) != null) {
+                        const substring_value = JSC.Expect.ExpectStringContaining.stringValueGetCached(value) orelse return;
 
                         this.addForNewLine("StringContaining ".len);
                         writer.writeAll("StringContaining ");
                         this.printAs(.String, Writer, writer_, substring_value, .String, enable_ansi_colors);
 
                         return;
-                    } else if (value.as(JSC.Jest.ExpectStringMatching) != null) {
-                        const test_value = JSC.Jest.ExpectStringMatching.testValueGetCached(value) orelse return;
+                    } else if (value.as(JSC.Expect.ExpectStringMatching) != null) {
+                        const test_value = JSC.Expect.ExpectStringMatching.testValueGetCached(value) orelse return;
 
                         this.addForNewLine("StringMatching ".len);
                         writer.writeAll("StringMatching ");
@@ -2559,7 +2599,7 @@ pub const ZigConsoleClient = struct {
                             {
                                 this.indent += 1;
                                 defer this.indent -|= 1;
-                                const count_without_children = props_iter.len - @as(usize, @boolToInt(children_prop != null));
+                                const count_without_children = props_iter.len - @as(usize, @intFromBool(children_prop != null));
 
                                 while (props_iter.next()) |prop| {
                                     if (prop.eqlComptime("children"))
@@ -2759,7 +2799,7 @@ pub const ZigConsoleClient = struct {
                     }
 
                     if (iter.i == 0) {
-                        if (value.isClass(this.globalThis) and !value.isCallable(this.globalThis.vm()))
+                        if (value.isClass(this.globalThis))
                             this.printAs(.Class, Writer, writer_, value, jsType, enable_ansi_colors)
                         else if (value.isCallable(this.globalThis.vm()))
                             this.printAs(.Function, Writer, writer_, value, jsType, enable_ansi_colors)
@@ -3002,7 +3042,7 @@ pub const ZigConsoleClient = struct {
         chars: [*]const u8,
         len: usize,
     ) callconv(.C) void {
-        const id = std.hash.Wyhash.hash(0, chars[0..len]);
+        const id = bun.hash(chars[0..len]);
         if (!pending_time_logs_loaded) {
             pending_time_logs = PendingTimers.init(default_allocator);
             pending_time_logs_loaded = true;
@@ -3026,12 +3066,12 @@ pub const ZigConsoleClient = struct {
             return;
         }
 
-        const id = std.hash.Wyhash.hash(0, chars[0..len]);
+        const id = bun.hash(chars[0..len]);
         var result = (pending_time_logs.fetchPut(id, null) catch null) orelse return;
         var value: std.time.Timer = result.value orelse return;
         // get the duration in microseconds
         // then display it in milliseconds
-        Output.printElapsed(@intToFloat(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
+        Output.printElapsed(@floatFromInt(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
         switch (len) {
             0 => Output.printErrorln("\n", .{}),
             else => Output.printErrorln(" {s}", .{chars[0..len]}),
@@ -3056,11 +3096,11 @@ pub const ZigConsoleClient = struct {
             return;
         }
 
-        const id = std.hash.Wyhash.hash(0, chars[0..len]);
+        const id = bun.hash(chars[0..len]);
         var value: std.time.Timer = (pending_time_logs.get(id) orelse return) orelse return;
         // get the duration in microseconds
         // then display it in milliseconds
-        Output.printElapsed(@intToFloat(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
+        Output.printElapsed(@floatFromInt(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
         switch (len) {
             0 => Output.printErrorln("\n", .{}),
             else => Output.printErrorln(" {s}", .{chars[0..len]}),
diff --git a/src/bun.js/bindings/generated_classes.zig b/src/bun.js/bindings/generated_classes.zig
index 0ec65a469..171bba792 100644
--- a/src/bun.js/bindings/generated_classes.zig
+++ b/src/bun.js/bindings/generated_classes.zig
@@ -108,6 +108,8 @@ pub const JSBlob = struct {
 
         if (@TypeOf(Blob.getArrayBuffer) != CallbackType)
             @compileLog("Expected Blob.getArrayBuffer to be a callback but received " ++ @typeName(@TypeOf(Blob.getArrayBuffer)));
+        if (@TypeOf(Blob.getExists) != CallbackType)
+            @compileLog("Expected Blob.getExists to be a callback but received " ++ @typeName(@TypeOf(Blob.getExists)));
         if (@TypeOf(Blob.getFormData) != CallbackType)
             @compileLog("Expected Blob.getFormData to be a callback but received " ++ @typeName(@TypeOf(Blob.getFormData)));
         if (@TypeOf(Blob.getJSON) != CallbackType)
@@ -136,6 +138,7 @@ pub const JSBlob = struct {
             @export(Blob.constructor, .{ .name = "BlobClass__construct" });
             @export(Blob.finalize, .{ .name = "BlobClass__finalize" });
             @export(Blob.getArrayBuffer, .{ .name = "BlobPrototype__getArrayBuffer" });
+            @export(Blob.getExists, .{ .name = "BlobPrototype__getExists" });
             @export(Blob.getFormData, .{ .name = "BlobPrototype__getFormData" });
             @export(Blob.getJSON, .{ .name = "BlobPrototype__getJSON" });
             @export(Blob.getLastModified, .{ .name = "BlobPrototype__getLastModified" });
@@ -1406,6 +1409,97 @@ pub const JSExpectStringMatching = struct {
         }
     }
 };
+pub const JSFSWatcher = struct { // Zig-side glue for FSWatcher: wraps the extern FSWatcher__* / FSWatcherPrototype__* symbols declared below
+    const FSWatcher = Classes.FSWatcher;
+    const GetterType = fn (*FSWatcher, *JSC.JSGlobalObject) callconv(.C) JSC.JSValue;
+    const GetterTypeWithThisValue = fn (*FSWatcher, JSC.JSValue, *JSC.JSGlobalObject) callconv(.C) JSC.JSValue;
+    const SetterType = fn (*FSWatcher, *JSC.JSGlobalObject, JSC.JSValue) callconv(.C) bool;
+    const SetterTypeWithThisValue = fn (*FSWatcher, JSC.JSValue, *JSC.JSGlobalObject, JSC.JSValue) callconv(.C) bool;
+    const CallbackType = fn (*FSWatcher, *JSC.JSGlobalObject, *JSC.CallFrame) callconv(.C) JSC.JSValue; // signature the comptime block below requires of doClose/hasRef/doRef/doUnref
+
+    /// Return the pointer to the wrapped object.
+    /// If the object does not match the type, return null.
+    pub fn fromJS(value: JSC.JSValue) ?*FSWatcher {
+        JSC.markBinding(@src());
+        return FSWatcher__fromJS(value);
+    }
+
+    extern fn FSWatcherPrototype__listenerSetCachedValue(JSC.JSValue, *JSC.JSGlobalObject, JSC.JSValue) void;
+
+    extern fn FSWatcherPrototype__listenerGetCachedValue(JSC.JSValue) JSC.JSValue;
+
+    /// Store `value` as the cached `listener` of `thisValue` by forwarding to the extern setter.
+    /// This value will be visited by the garbage collector.
+    pub fn listenerSetCached(thisValue: JSC.JSValue, globalObject: *JSC.JSGlobalObject, value: JSC.JSValue) void {
+        JSC.markBinding(@src());
+        FSWatcherPrototype__listenerSetCachedValue(thisValue, globalObject, value);
+    }
+
+    /// Read the cached `listener` of `thisValue`; returns null when the extern getter yields `.zero`.
+    /// This value will be visited by the garbage collector.
+    pub fn listenerGetCached(thisValue: JSC.JSValue) ?JSC.JSValue {
+        JSC.markBinding(@src());
+        const result = FSWatcherPrototype__listenerGetCachedValue(thisValue);
+        if (result == .zero) // `.zero` is mapped to null so callers get an optional
+            return null;
+
+        return result;
+    }
+
+    /// Create a JS value for `this` via `FSWatcher__create`; assert-enabled builds check it unwraps back to `this`.
+    pub fn toJS(this: *FSWatcher, globalObject: *JSC.JSGlobalObject) JSC.JSValue {
+        JSC.markBinding(@src());
+        if (comptime Environment.allow_assert) {
+            const value__ = FSWatcher__create(globalObject, this);
+            std.debug.assert(value__.as(FSWatcher).? == this); // If this fails, likely a C ABI issue.
+            return value__;
+        } else {
+            return FSWatcher__create(globalObject, this);
+        }
+    }
+
+    /// Point the internal ptr of `value` at `ptr` (which may be null); returns the extern call's bool result.
+    pub fn dangerouslySetPtr(value: JSC.JSValue, ptr: ?*FSWatcher) bool {
+        JSC.markBinding(@src());
+        return FSWatcher__dangerouslySetPtr(value, ptr);
+    }
+
+    /// Detach the ptr from `value` by setting it to null; asserts that the extern call returned true.
+    pub fn detachPtr(_: *FSWatcher, value: JSC.JSValue) void {
+        JSC.markBinding(@src());
+        std.debug.assert(FSWatcher__dangerouslySetPtr(value, null)); // the extern call clears the ptr; its bool result is what gets asserted
+    }
+
+    extern fn FSWatcher__fromJS(JSC.JSValue) ?*FSWatcher;
+    extern fn FSWatcher__getConstructor(*JSC.JSGlobalObject) JSC.JSValue;
+
+    extern fn FSWatcher__create(globalObject: *JSC.JSGlobalObject, ptr: ?*FSWatcher) JSC.JSValue;
+
+    extern fn FSWatcher__dangerouslySetPtr(JSC.JSValue, ?*FSWatcher) bool;
+
+    comptime { // compile-time signature checks on FSWatcher's methods, followed by their @export under fixed symbol names
+        if (@TypeOf(FSWatcher.finalize) != (fn (*FSWatcher) callconv(.C) void)) {
+            @compileLog("FSWatcher.finalize is not a finalizer");
+        }
+
+        if (@TypeOf(FSWatcher.doClose) != CallbackType)
+            @compileLog("Expected FSWatcher.doClose to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.doClose)));
+        if (@TypeOf(FSWatcher.hasRef) != CallbackType)
+            @compileLog("Expected FSWatcher.hasRef to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.hasRef)));
+        if (@TypeOf(FSWatcher.doRef) != CallbackType)
+            @compileLog("Expected FSWatcher.doRef to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.doRef)));
+        if (@TypeOf(FSWatcher.doUnref) != CallbackType)
+            @compileLog("Expected FSWatcher.doUnref to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.doUnref)));
+        if (!JSC.is_bindgen) { // exports are skipped when JSC.is_bindgen is set
+            @export(FSWatcher.doClose, .{ .name = "FSWatcherPrototype__doClose" });
+            @export(FSWatcher.doRef, .{ .name = "FSWatcherPrototype__doRef" });
+            @export(FSWatcher.doUnref, .{ .name = "FSWatcherPrototype__doUnref" });
+            @export(FSWatcher.finalize, .{ .name = "FSWatcherClass__finalize" });
+            @export(FSWatcher.hasPendingActivity, .{ .name = "FSWatcher__hasPendingActivity" }); // NOTE(review): no @TypeOf check above covers hasPendingActivity — confirm its signature against the C++ side
+            @export(FSWatcher.hasRef, .{ .name = "FSWatcherPrototype__hasRef" });
+        }
+    }
+};
 pub const JSFileSystemRouter = struct {
     const FileSystemRouter = Classes.FileSystemRouter;
     const GetterType = fn (*FileSystemRouter, *JSC.JSGlobalObject) callconv(.C) JSC.JSValue;
@@ -2312,6 +2406,8 @@ pub const JSNodeJSFS = struct {
             @compileLog("Expected NodeJSFS.utimes to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.utimes)));
         if (@TypeOf(NodeJSFS.utimesSync) != CallbackType)
             @compileLog("Expected NodeJSFS.utimesSync to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.utimesSync)));
+        if (@TypeOf(NodeJSFS.watch) != CallbackType)
+            @compileLog("Expected NodeJSFS.watch to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.watch)));
         if (@TypeOf(NodeJSFS.write) != CallbackType)
             @compileLog("Expected NodeJSFS.write to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.write)));
         if (@TypeOf(NodeJSFS.writeFile) != CallbackType)
@@ -2402,6 +2498,7 @@ pub const JSNodeJSFS = struct {
             @export(NodeJSFS.unlinkSync, .{ .name = "NodeJSFSPrototype__unlinkSync" });
             @export(NodeJSFS.utimes, .{ .name = "NodeJSFSPrototype__utimes" });
             @export(NodeJSFS.utimesSync, .{ .name = "NodeJSFSPrototype__utimesSync" });
+            @export(NodeJSFS.watch, .{ .name = "NodeJSFSPrototype__watch" });
             @export(NodeJSFS.write, .{ .name = "NodeJSFSPrototype__write" });
             @export(NodeJSFS.writeFile, .{ .name = "NodeJSFSPrototype__writeFile" });
             @export(NodeJSFS.writeFileSync, .{ .name = "NodeJSFSPrototype__writeFileSync" });
@@ -4329,6 +4426,9 @@ pub const JSTCPSocket = struct {
             @compileLog("TCPSocket.finalize is not a finalizer");
         }
 
+        if (@TypeOf(TCPSocket.getALPNProtocol) != GetterType)
+            @compileLog("Expected TCPSocket.getALPNProtocol to be a getter");
+
         if (@TypeOf(TCPSocket.getAuthorized) != GetterType)
             @compileLog("Expected TCPSocket.getAuthorized to be a getter");
 
@@ -4359,18 +4459,23 @@ pub const JSTCPSocket = struct {
         if (@TypeOf(TCPSocket.getRemoteAddress) != GetterType)
             @compileLog("Expected TCPSocket.getRemoteAddress to be a getter");
 
+        if (@TypeOf(TCPSocket.setServername) != CallbackType)
+            @compileLog("Expected TCPSocket.setServername to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.setServername)));
         if (@TypeOf(TCPSocket.shutdown) != CallbackType)
             @compileLog("Expected TCPSocket.shutdown to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.shutdown)));
         if (@TypeOf(TCPSocket.timeout) != CallbackType)
             @compileLog("Expected TCPSocket.timeout to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.timeout)));
         if (@TypeOf(TCPSocket.unref) != CallbackType)
             @compileLog("Expected TCPSocket.unref to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.unref)));
+        if (@TypeOf(TCPSocket.upgradeTLS) != CallbackType)
+            @compileLog("Expected TCPSocket.upgradeTLS to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.upgradeTLS)));
         if (@TypeOf(TCPSocket.write) != CallbackType)
             @compileLog("Expected TCPSocket.write to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.write)));
         if (!JSC.is_bindgen) {
             @export(TCPSocket.end, .{ .name = "TCPSocketPrototype__end" });
             @export(TCPSocket.finalize, .{ .name = "TCPSocketClass__finalize" });
             @export(TCPSocket.flush, .{ .name = "TCPSocketPrototype__flush" });
+            @export(TCPSocket.getALPNProtocol, .{ .name = "TCPSocketPrototype__getALPNProtocol" });
             @export(TCPSocket.getAuthorizationError, .{ .name = "TCPSocketPrototype__getAuthorizationError" });
             @export(TCPSocket.getAuthorized, .{ .name = "TCPSocketPrototype__getAuthorized" });
             @export(TCPSocket.getData, .{ .name = "TCPSocketPrototype__getData" });
@@ -4382,9 +4487,11 @@ pub const JSTCPSocket = struct {
             @export(TCPSocket.ref, .{ .name = "TCPSocketPrototype__ref" });
             @export(TCPSocket.reload, .{ .name = "TCPSocketPrototype__reload" });
             @export(TCPSocket.setData, .{ .name = "TCPSocketPrototype__setData" });
+            @export(TCPSocket.setServername, .{ .name = "TCPSocketPrototype__setServername" });
             @export(TCPSocket.shutdown, .{ .name = "TCPSocketPrototype__shutdown" });
             @export(TCPSocket.timeout, .{ .name = "TCPSocketPrototype__timeout" });
             @export(TCPSocket.unref, .{ .name = "TCPSocketPrototype__unref" });
+            @export(TCPSocket.upgradeTLS, .{ .name = "TCPSocketPrototype__upgradeTLS" });
             @export(TCPSocket.write, .{ .name = "TCPSocketPrototype__write" });
         }
     }
@@ -4484,6 +4591,9 @@ pub const JSTLSSocket = struct {
             @compileLog("TLSSocket.finalize is not a finalizer");
         }
 
+        if (@TypeOf(TLSSocket.getALPNProtocol) != GetterType)
+            @compileLog("Expected TLSSocket.getALPNProtocol to be a getter");
+
         if (@TypeOf(TLSSocket.getAuthorized) != GetterType)
             @compileLog("Expected TLSSocket.getAuthorized to be a getter");
 
@@ -4514,18 +4624,23 @@ pub const JSTLSSocket = struct {
         if (@TypeOf(TLSSocket.getRemoteAddress) != GetterType)
             @compileLog("Expected TLSSocket.getRemoteAddress to be a getter");
 
+        if (@TypeOf(TLSSocket.setServername) != CallbackType)
+            @compileLog("Expected TLSSocket.setServername to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.setServername)));
         if (@TypeOf(TLSSocket.shutdown) != CallbackType)
             @compileLog("Expected TLSSocket.shutdown to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.shutdown)));
         if (@TypeOf(TLSSocket.timeout) != CallbackType)
             @compileLog("Expected TLSSocket.timeout to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.timeout)));
         if (@TypeOf(TLSSocket.unref) != CallbackType)
             @compileLog("Expected TLSSocket.unref to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.unref)));
+        if (@TypeOf(TLSSocket.upgradeTLS) != CallbackType)
+            @compileLog("Expected TLSSocket.upgradeTLS to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.upgradeTLS)));
         if (@TypeOf(TLSSocket.write) != CallbackType)
             @compileLog("Expected TLSSocket.write to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.write)));
         if (!JSC.is_bindgen) {
             @export(TLSSocket.end, .{ .name = "TLSSocketPrototype__end" });
             @export(TLSSocket.finalize, .{ .name = "TLSSocketClass__finalize" });
             @export(TLSSocket.flush, .{ .name = "TLSSocketPrototype__flush" });
+            @export(TLSSocket.getALPNProtocol, .{ .name = "TLSSocketPrototype__getALPNProtocol" });
             @export(TLSSocket.getAuthorizationError, .{ .name = "TLSSocketPrototype__getAuthorizationError" });
             @export(TLSSocket.getAuthorized, .{ .name = "TLSSocketPrototype__getAuthorized" });
             @export(TLSSocket.getData, .{ .name = "TLSSocketPrototype__getData" });
@@ -4537,9 +4652,11 @@ pub const JSTLSSocket = struct {
             @export(TLSSocket.ref, .{ .name = "TLSSocketPrototype__ref" });
             @export(TLSSocket.reload, .{ .name = "TLSSocketPrototype__reload" });
             @export(TLSSocket.setData, .{ .name = "TLSSocketPrototype__setData" });
+            @export(TLSSocket.setServername, .{ .name = "TLSSocketPrototype__setServername" });
             @export(TLSSocket.shutdown, .{ .name = "TLSSocketPrototype__shutdown" });
             @export(TLSSocket.timeout, .{ .name = "TLSSocketPrototype__timeout" });
             @export(TLSSocket.unref, .{ .name = "TLSSocketPrototype__unref" });
+            @export(TLSSocket.upgradeTLS, .{ .name = "TLSSocketPrototype__upgradeTLS" });
             @export(TLSSocket.write, .{ .name = "TLSSocketPrototype__write" });
         }
     }
@@ -4855,6 +4972,7 @@ comptime {
     _ = JSExpectAnything;
     _ = JSExpectStringContaining;
     _ = JSExpectStringMatching;
+    _ = JSFSWatcher;
     _ = JSFileSystemRouter;
     _ = JSListener;
     _ = JSMD4;
diff --git a/src/bun.js/bindings/generated_classes_list.zig b/src/bun.js/bindings/generated_classes_list.zig
index d5d987dce..543d492b5 100644
--- a/src/bun.js/bindings/generated_classes_list.zig
+++ b/src/bun.js/bindings/generated_classes_list.zig
@@ -4,11 +4,11 @@ pub const Classes = struct {
     pub const Blob = JSC.WebCore.Blob;
     pub const CryptoHasher = JSC.API.Bun.Crypto.CryptoHasher;
     pub const Dirent = JSC.Node.Dirent;
-    pub const Expect = JSC.Jest.Expect;
-    pub const ExpectAny = JSC.Jest.ExpectAny;
-    pub const ExpectAnything = JSC.Jest.ExpectAnything;
-    pub const ExpectStringContaining = JSC.Jest.ExpectStringContaining;
-    pub const ExpectStringMatching = JSC.Jest.ExpectStringMatching;
+    pub const Expect = JSC.Expect.Expect;
+    pub const ExpectAny = JSC.Expect.ExpectAny;
+    pub const ExpectAnything = JSC.Expect.ExpectAnything;
+    pub const ExpectStringContaining = JSC.Expect.ExpectStringContaining;
+    pub const ExpectStringMatching = JSC.Expect.ExpectStringMatching;
     pub const FileSystemRouter = JSC.API.FileSystemRouter;
     pub const Bundler = JSC.API.JSBundler;
     pub const JSBundler = Bundler;
@@ -37,4 +37,5 @@ pub const Classes = struct {
     pub const BuildArtifact = JSC.API.BuildArtifact;
     pub const BuildMessage = JSC.BuildMessage;
     pub const ResolveMessage = JSC.ResolveMessage;
+    pub const FSWatcher = JSC.Node.FSWatcher;
 };
diff --git a/src/bun.js/bindings/header-gen.zig b/src/bun.js/bindings/header-gen.zig
index 089506a8f..eb0de1c09 100644
--- a/src/bun.js/bindings/header-gen.zig
+++ b/src/bun.js/bindings/header-gen.zig
@@ -807,7 +807,7 @@ pub fn HeaderGen(comptime first_import: type, comptime second_import: type, comp
                                                 }
                                             };
                                             var extern_list = Type.Extern;
-                                            std.sort.sort([]const u8, &extern_list, Sorder{}, Sorder.lessThan);
+                                            std.sort.block([]const u8, &extern_list, Sorder{}, Sorder.lessThan);
                                             break :brk extern_list;
                                         };
                                         // impl_writer.print("  #include {s}\n", .{Type.include}) catch unreachable;
@@ -840,7 +840,7 @@ pub fn HeaderGen(comptime first_import: type, comptime second_import: type, comp
                                                 }
                                             };
                                             var extern_list = Type.Export;
-                                            std.sort.sort(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
+                                            std.sort.block(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
                                             break :brk extern_list;
                                         };
 
@@ -867,7 +867,7 @@ pub fn HeaderGen(comptime first_import: type, comptime second_import: type, comp
                                     //             }
                                     //         };
                                     //         var extern_list = Type.lazy_static_functions;
-                                    //         std.sort.sort(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
+                                    //         std.sort.block(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
                                     //         break :brk extern_list;
                                     //     };
 
diff --git a/src/bun.js/bindings/headers-handwritten.h b/src/bun.js/bindings/headers-handwritten.h
index 57940550f..90c8f86d2 100644
--- a/src/bun.js/bindings/headers-handwritten.h
+++ b/src/bun.js/bindings/headers-handwritten.h
@@ -72,6 +72,7 @@ typedef struct ResolvedSource {
     void* allocator;
     uint64_t tag;
 } ResolvedSource;
+static const uint64_t ResolvedSourceTagPackageJSONTypeModule = 1;
 typedef union ErrorableResolvedSourceResult {
     ResolvedSource value;
     ZigErrorType err;
@@ -83,10 +84,10 @@ typedef struct ErrorableResolvedSource {
 
 typedef struct SystemError {
     int errno_;
-    ZigString code;
-    ZigString message;
-    ZigString path;
-    ZigString syscall;
+    BunString code;
+    BunString message;
+    BunString path;
+    BunString syscall;
     int fd;
 } SystemError;
 
@@ -119,15 +120,15 @@ typedef struct ZigStackFramePosition {
 } ZigStackFramePosition;
 
 typedef struct ZigStackFrame {
-    ZigString function_name;
-    ZigString source_url;
+    BunString function_name;
+    BunString source_url;
     ZigStackFramePosition position;
     ZigStackFrameCode code_type;
     bool remapped;
 } ZigStackFrame;
 
 typedef struct ZigStackTrace {
-    ZigString* source_lines_ptr;
+    BunString* source_lines_ptr;
     int32_t* source_lines_numbers;
     uint8_t source_lines_len;
     uint8_t source_lines_to_collect;
@@ -139,11 +140,11 @@ typedef struct ZigException {
     unsigned char code;
     uint16_t runtime_type;
     int errno_;
-    ZigString syscall;
-    ZigString code_;
-    ZigString path;
-    ZigString name;
-    ZigString message;
+    BunString syscall;
+    BunString code_;
+    BunString path;
+    BunString name;
+    BunString message;
     ZigStackTrace stack;
     void* exception;
     bool remapped;
@@ -245,6 +246,10 @@ BunString toString(WTF::String& wtfString);
 BunString toString(const WTF::String& wtfString);
 BunString toString(WTF::StringImpl* wtfString);
 
+BunString toStringRef(JSC::JSGlobalObject* globalObject, JSC::JSValue value);
+BunString toStringRef(WTF::String& wtfString);
+BunString toStringRef(const WTF::String& wtfString);
+BunString toStringRef(WTF::StringImpl* wtfString);
 }
 
 using Uint8Array_alias = JSC::JSUint8Array;
diff --git a/src/bun.js/bindings/headers.h b/src/bun.js/bindings/headers.h
index cdf7e05f4..f507121f8 100644
--- a/src/bun.js/bindings/headers.h
+++ b/src/bun.js/bindings/headers.h
@@ -253,6 +253,7 @@ CPP_DECL JSC__JSPromise* JSC__JSPromise__resolvedPromise(JSC__JSGlobalObject* ar
 CPP_DECL JSC__JSValue JSC__JSPromise__resolvedPromiseValue(JSC__JSGlobalObject* arg0, JSC__JSValue JSValue1);
 CPP_DECL void JSC__JSPromise__resolveOnNextTick(JSC__JSPromise* arg0, JSC__JSGlobalObject* arg1, JSC__JSValue JSValue2);
 CPP_DECL JSC__JSValue JSC__JSPromise__result(JSC__JSPromise* arg0, JSC__VM* arg1);
+CPP_DECL void JSC__JSPromise__setHandled(JSC__JSPromise* arg0, JSC__VM* arg1);
 CPP_DECL uint32_t JSC__JSPromise__status(const JSC__JSPromise* arg0, JSC__VM* arg1);
 
 #pragma mark - JSC::JSInternalPromise
@@ -267,6 +268,7 @@ CPP_DECL void JSC__JSInternalPromise__rejectWithCaughtException(JSC__JSInternalP
 CPP_DECL void JSC__JSInternalPromise__resolve(JSC__JSInternalPromise* arg0, JSC__JSGlobalObject* arg1, JSC__JSValue JSValue2);
 CPP_DECL JSC__JSInternalPromise* JSC__JSInternalPromise__resolvedPromise(JSC__JSGlobalObject* arg0, JSC__JSValue JSValue1);
 CPP_DECL JSC__JSValue JSC__JSInternalPromise__result(const JSC__JSInternalPromise* arg0, JSC__VM* arg1);
+CPP_DECL void JSC__JSInternalPromise__setHandled(JSC__JSInternalPromise* arg0, JSC__VM* arg1);
 CPP_DECL uint32_t JSC__JSInternalPromise__status(const JSC__JSInternalPromise* arg0, JSC__VM* arg1);
 
 #pragma mark - JSC::JSFunction
diff --git a/src/bun.js/bindings/headers.zig b/src/bun.js/bindings/headers.zig
index 4dda5f30b..666369b21 100644
--- a/src/bun.js/bindings/headers.zig
+++ b/src/bun.js/bindings/headers.zig
@@ -168,6 +168,7 @@ pub extern fn JSC__JSPromise__resolvedPromise(arg0: *bindings.JSGlobalObject, JS
 pub extern fn JSC__JSPromise__resolvedPromiseValue(arg0: *bindings.JSGlobalObject, JSValue1: JSC__JSValue) JSC__JSValue;
 pub extern fn JSC__JSPromise__resolveOnNextTick(arg0: ?*bindings.JSPromise, arg1: *bindings.JSGlobalObject, JSValue2: JSC__JSValue) void;
 pub extern fn JSC__JSPromise__result(arg0: ?*bindings.JSPromise, arg1: *bindings.VM) JSC__JSValue;
+pub extern fn JSC__JSPromise__setHandled(arg0: ?*bindings.JSPromise, arg1: *bindings.VM) void;
 pub extern fn JSC__JSPromise__status(arg0: [*c]const JSC__JSPromise, arg1: *bindings.VM) u32;
 pub extern fn JSC__JSInternalPromise__create(arg0: *bindings.JSGlobalObject) [*c]bindings.JSInternalPromise;
 pub extern fn JSC__JSInternalPromise__isHandled(arg0: [*c]const JSC__JSInternalPromise, arg1: *bindings.VM) bool;
@@ -179,6 +180,7 @@ pub extern fn JSC__JSInternalPromise__rejectWithCaughtException(arg0: [*c]bindin
 pub extern fn JSC__JSInternalPromise__resolve(arg0: [*c]bindings.JSInternalPromise, arg1: *bindings.JSGlobalObject, JSValue2: JSC__JSValue) void;
 pub extern fn JSC__JSInternalPromise__resolvedPromise(arg0: *bindings.JSGlobalObject, JSValue1: JSC__JSValue) [*c]bindings.JSInternalPromise;
 pub extern fn JSC__JSInternalPromise__result(arg0: [*c]const JSC__JSInternalPromise, arg1: *bindings.VM) JSC__JSValue;
+pub extern fn JSC__JSInternalPromise__setHandled(arg0: [*c]bindings.JSInternalPromise, arg1: *bindings.VM) void;
 pub extern fn JSC__JSInternalPromise__status(arg0: [*c]const JSC__JSInternalPromise, arg1: *bindings.VM) u32;
 pub extern fn JSC__JSFunction__optimizeSoon(JSValue0: JSC__JSValue) void;
 pub extern fn JSC__JSGlobalObject__bunVM(arg0: *bindings.JSGlobalObject) ?*bindings.VirtualMachine;
diff --git a/src/bun.js/bindings/helpers.h b/src/bun.js/bindings/helpers.h
index 402807f3d..00777c304 100644
--- a/src/bun.js/bindings/helpers.h
+++ b/src/bun.js/bindings/helpers.h
@@ -342,10 +342,10 @@ static const WTF::String toStringStatic(ZigString str)
     }
 
     if (isTaggedUTF16Ptr(str.ptr)) {
-        return WTF::String(WTF::ExternalStringImpl::createStatic(reinterpret_cast<const UChar*>(untag(str.ptr)), str.len));
+        return WTF::String(AtomStringImpl::add(reinterpret_cast<const UChar*>(untag(str.ptr)), str.len));
     }
 
-    return WTF::String(WTF::ExternalStringImpl::createStatic(
+    return WTF::String(AtomStringImpl::add(
         reinterpret_cast<const LChar*>(untag(str.ptr)), str.len));
 }
 
diff --git a/src/bun.js/bindings/napi.cpp b/src/bun.js/bindings/napi.cpp
index a859e3ac5..8fffcc05f 100644
--- a/src/bun.js/bindings/napi.cpp
+++ b/src/bun.js/bindings/napi.cpp
@@ -14,7 +14,7 @@
 #include "wtf/text/StringView.h"
 #include "wtf/text/StringBuilder.h"
 #include "wtf/text/WTFString.h"
-
+#include "BufferEncodingType.h"
 #include "JavaScriptCore/AggregateError.h"
 #include "JavaScriptCore/BytecodeIndex.h"
 #include "JavaScriptCore/CallFrame.h"
@@ -554,7 +554,6 @@ extern "C" napi_status napi_wrap(napi_env env,
 
     auto* globalObject = toJS(env);
     auto& vm = globalObject->vm();
-    
 
     auto* val = jsDynamicCast<NapiPrototype*>(value);
 
@@ -572,7 +571,7 @@ extern "C" napi_status napi_wrap(napi_env env,
     auto clientData = WebCore::clientData(vm);
 
     auto* ref = new NapiRef(globalObject, 1);
-    ref->strongRef.set(globalObject->vm(), value.getObject());    
+    ref->strongRef.set(globalObject->vm(), value.getObject());
 
     if (finalize_cb) {
         ref->finalizer.finalize_cb = finalize_cb;
@@ -816,7 +815,7 @@ extern "C" napi_status napi_create_reference(napi_env env, napi_value value,
         }
     }
 
-    if(object) {
+    if (object) {
         object->napiRef = ref;
     }
 
@@ -1029,7 +1028,26 @@ extern "C" napi_status napi_create_type_error(napi_env env, napi_value code,
 
     auto error = JSC::createTypeError(globalObject, messageValue.toWTFString(globalObject));
     if (codeValue) {
-        error->putDirect(vm, Identifier::fromString(vm, "code"_s), codeValue, 0);
+        error->putDirect(vm, WebCore::builtinNames(vm).codePublicName(), codeValue, 0);
+    }
+
+    *result = reinterpret_cast<napi_value>(JSC::JSValue::encode(error));
+    return napi_ok;
+}
+
+extern "C" napi_status napi_create_error(napi_env env, napi_value code,
+    napi_value msg,
+    napi_value* result)
+{
+    Zig::GlobalObject* globalObject = toJS(env);
+    JSC::VM& vm = globalObject->vm();
+
+    JSC::JSValue codeValue = JSC::JSValue::decode(reinterpret_cast<JSC::EncodedJSValue>(code));
+    JSC::JSValue messageValue = JSC::JSValue::decode(reinterpret_cast<JSC::EncodedJSValue>(msg));
+
+    auto error = JSC::createError(globalObject, messageValue.toWTFString(globalObject));
+    if (codeValue) {
+        error->putDirect(vm, WebCore::builtinNames(vm).codePublicName(), codeValue, 0);
     }
 
     *result = reinterpret_cast<napi_value>(JSC::JSValue::encode(error));
@@ -1170,9 +1188,7 @@ void NapiClass::visitChildrenImpl(JSCell* cell, Visitor& visitor)
 
 DEFINE_VISIT_CHILDREN(NapiClass);
 
-static JSC_DECLARE_HOST_FUNCTION(NapiClass_ConstructorFunction);
-
-static JSC_DEFINE_HOST_FUNCTION(NapiClass_ConstructorFunction,
+JSC_DEFINE_HOST_FUNCTION(NapiClass_ConstructorFunction,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
@@ -1262,7 +1278,6 @@ NapiClass* NapiClass::create(VM& vm, Zig::GlobalObject* globalObject, const char
 {
     WTF::String name = WTF::String::fromUTF8(utf8name, length).isolatedCopy();
     NativeExecutable* executable = vm.getHostFunction(NapiClass_ConstructorFunction, ImplementationVisibility::Public, NapiClass_ConstructorFunction, name);
-
     Structure* structure = globalObject->NapiClassStructure();
     NapiClass* napiClass = new (NotNull, allocateCell<NapiClass>(vm)) NapiClass(vm, executable, globalObject, structure);
     napiClass->finishCreation(vm, executable, length, name, constructor, data, property_count, properties);
@@ -1474,7 +1489,85 @@ extern "C" napi_status napi_get_property_names(napi_env env, napi_value object,
     return napi_ok;
 }
 
-extern "C" napi_status napi_create_object(napi_env env, napi_value* result){
+// Copies a JS string into `buf` as UTF-8. Following the Node-API contract,
+// `bufsize` is the full buffer size: at most `bufsize - 1` bytes of text are
+// written, then a NUL terminator. `*writtenPtr` excludes the terminator. When
+// `buf` is null, only the required UTF-8 byte length is reported.
+extern "C" napi_status napi_get_value_string_utf8(napi_env env, napi_value napiValue,
+    char* buf, size_t bufsize, size_t* writtenPtr)
+{
+    JSGlobalObject* globalObject = toJS(env);
+    JSValue jsValue = toJS(napiValue);
+    if (!jsValue || !jsValue.isString()) {
+        return napi_string_expected;
+    }
+
+    JSString* jsString = jsValue.toStringOrNull(globalObject);
+    if (UNLIKELY(!jsString)) {
+        return napi_generic_failure;
+    }
+
+    auto viewWithUnderlyingString = jsString->viewWithUnderlyingString(globalObject);
+    auto view = viewWithUnderlyingString.view;
+    const auto utf8 = static_cast<uint8_t>(WebCore::BufferEncodingType::utf8);
+
+    if (buf == nullptr) {
+        if (writtenPtr != nullptr) {
+            *writtenPtr = view.is8Bit()
+                ? Bun__encoding__byteLengthLatin1(view.characters8(), view.length(), utf8)
+                : Bun__encoding__byteLengthUTF16(view.characters16(), view.length(), utf8);
+        }
+        return napi_ok;
+    }
+
+    // `buf` is an uninitialized output buffer, so strlen() cannot recover its
+    // capacity. Without a usable size, report an empty result instead.
+    if (bufsize == 0 || bufsize == NAPI_AUTO_LENGTH) {
+        if (bufsize != 0) {
+            buf[0] = '\0';
+        }
+        if (writtenPtr != nullptr) {
+            *writtenPtr = 0;
+        }
+        return napi_ok;
+    }
+
+    // Keep the last byte free so the terminator always fits.
+    auto* out = reinterpret_cast<unsigned char*>(buf);
+    const size_t written = view.is8Bit()
+        ? Bun__encoding__writeLatin1(view.characters8(), view.length(), out, bufsize - 1, utf8)
+        : Bun__encoding__writeUTF16(view.characters16(), view.length(), out, bufsize - 1, utf8);
+
+    if (writtenPtr != nullptr) {
+        *writtenPtr = written;
+    }
+    buf[written < bufsize ? written : bufsize - 1] = '\0';
+    return napi_ok;
+}
+
+// Reads `objectValue[index]` into `*result` (left untouched when `result` is null).
+// Non-object inputs yield napi_invalid_arg; a pending exception yields napi_generic_failure.
+extern "C" napi_status napi_get_element(napi_env env, napi_value objectValue,
+    uint32_t index, napi_value* result)
+{
+    JSValue target = toJS(objectValue);
+    if (!(target && target.isObject())) {
+        return napi_invalid_arg;
+    }
+
+    JSObject* holder = target.getObject();
+    auto scope = DECLARE_THROW_SCOPE(holder->vm());
+    JSValue fetched = holder->getIndex(toJS(env), index);
+    RETURN_IF_EXCEPTION(scope, napi_generic_failure);
+
+    if (result != nullptr) {
+        *result = toNapi(fetched);
+    }
+    return napi_ok;
+}
+
+extern "C" napi_status napi_create_object(napi_env env, napi_value* result)
+{
 
     if (UNLIKELY(result == nullptr)) {
         return napi_invalid_arg;
@@ -1520,8 +1613,10 @@ extern "C" napi_status napi_typeof(napi_env env, napi_value val,
 
     JSC::JSValue value = toJS(val);
 
-    if (UNLIKELY(value.isEmpty())) {
-        return napi_invalid_arg;
+    if (value.isEmpty()) {
+        // An empty JSValue can reach this point; report it as undefined instead of failing.
+        *result = napi_undefined;
+        return napi_ok;
     }
 
     if (value.isCell()) {
@@ -1560,17 +1655,18 @@ extern "C" napi_status napi_typeof(napi_env env, napi_value val,
             *result = napi_object;
             return napi_ok;
 
-        default:
-            if (cell->isObject()) {
-                *result = napi_object;
+        default: {
+            if (cell->isCallable() || cell->isConstructor()) {
+                *result = napi_function;
                 return napi_ok;
             }
 
-            if (cell->isCallable() || cell->isConstructor()) {
-                *result = napi_function;
+            if (cell->isObject()) {
+                *result = napi_object;
                 return napi_ok;
             }
         }
+        }
     }
 
     if (value.isNumber()) {
diff --git a/src/bun.js/bindings/node_util_types.cpp b/src/bun.js/bindings/node_util_types.cpp
index 0c75662cf..f7ae3949e 100644
--- a/src/bun.js/bindings/node_util_types.cpp
+++ b/src/bun.js/bindings/node_util_types.cpp
@@ -322,7 +322,7 @@ void generateNodeUtilTypesSourceCode(JSC::JSGlobalObject* lexicalGlobalObject,
 
     JSC::VM& vm = globalObject->vm();
 
-    JSC::JSObject* defaultObject = constructEmptyObject(globalObject, globalObject->objectPrototype(), 43);
+    JSC::JSObject* defaultObject = constructEmptyObject(globalObject, globalObject->objectPrototype(), 42);
     exportNames.reserveCapacity(43);
     exportValues.ensureCapacity(43);
 
diff --git a/src/bun.js/bindings/shimmer.zig b/src/bun.js/bindings/shimmer.zig
index a90bfab87..3a6242000 100644
--- a/src/bun.js/bindings/shimmer.zig
+++ b/src/bun.js/bindings/shimmer.zig
@@ -163,7 +163,7 @@ pub fn Shimmer(comptime _namespace: []const u8, comptime _name: []const u8, comp
             if (comptime isNullableType(ExpectedReturnType) != isNullableType(ExternReturnType)) {
                 return value.?;
             } else if (comptime (@typeInfo(ExpectedReturnType) == .Enum) and (@typeInfo(ExternReturnType) != .Enum)) {
-                return @intToEnum(ExpectedReturnType, value);
+                return @enumFromInt(ExpectedReturnType, value);
             } else {
                 return value;
             }
diff --git a/src/bun.js/bindings/simdutf.cpp b/src/bun.js/bindings/simdutf.cpp
index ea0d95f42..6d20bcf5e 100644
--- a/src/bun.js/bindings/simdutf.cpp
+++ b/src/bun.js/bindings/simdutf.cpp
@@ -1,8 +1,8 @@
-/* auto-generated on 2023-02-10 14:42:58 -0500. Do not edit! */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf.cpp
+/* auto-generated on 2023-06-21 08:09:45 -0400. Do not edit! */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf.cpp
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=implementation.cpp
 /* begin file src/implementation.cpp */
 #include <initializer_list>
 #include <climits>
@@ -11,22 +11,23 @@
 namespace simdutf {
 namespace {
 
-template <typename T>
-std::string toBinaryString(T b) {
-   std::string binary = "";
-   T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
-   while (mask > 0) {
-    binary += ((b & mask) == 0) ? '0' : '1';
-    mask >>= 1;
-  }
-  return binary;
+template<typename T>
+std::string toBinaryString(T b)
+{
+    std::string binary = "";
+    T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
+    while (mask > 0) {
+        binary += ((b & mask) == 0) ? '0' : '1';
+        mask >>= 1;
+    }
+    return binary;
 }
 }
 }
 
 // Implementations
 // The best choice should always come first!
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64.h
 /* begin file src/simdutf/arm64.h */
 #ifndef SIMDUTF_ARM64_H
 #define SIMDUTF_ARM64_H
@@ -35,13 +36,10 @@ std::string toBinaryString(T b) {
 #error "arm64.h must be included before fallback.h"
 #endif
 
-
 #ifndef SIMDUTF_IMPLEMENTATION_ARM64
 #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
 #endif
-#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
-
-
+#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64&& SIMDUTF_IS_ARM64
 
 #if SIMDUTF_IMPLEMENTATION_ARM64
 
@@ -53,12 +51,11 @@ namespace arm64 {
 } // namespace arm64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/implementation.h
 /* begin file src/simdutf/arm64/implementation.h */
 #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
 #define SIMDUTF_ARM64_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace arm64 {
 
@@ -68,60 +65,85 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace arm64
@@ -130,26 +152,25 @@ public:
 #endif // SIMDUTF_ARM64_IMPLEMENTATION_H
 /* end file src/simdutf/arm64/implementation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/begin.h
 /* begin file src/simdutf/arm64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
 // #define SIMDUTF_IMPLEMENTATION arm64
 /* end file src/simdutf/arm64/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/intrinsics.h
 /* begin file src/simdutf/arm64/intrinsics.h */
 #ifndef SIMDUTF_ARM64_INTRINSICS_H
 #define SIMDUTF_ARM64_INTRINSICS_H
 
-
 // This should be the correct header whether
 // you use visual studio or other compilers.
 #include <arm_neon.h>
 
 #endif //  SIMDUTF_ARM64_INTRINSICS_H
 /* end file src/simdutf/arm64/intrinsics.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
 /* begin file src/simdutf/arm64/bitmanipulation.h */
 #ifndef SIMDUTF_ARM64_BITMANIPULATION_H
 #define SIMDUTF_ARM64_BITMANIPULATION_H
@@ -159,8 +180,9 @@ namespace arm64 {
 namespace {
 
 /* result might be undefined when input_num is zero */
-simdutf_really_inline int count_ones(uint64_t input_num) {
-   return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
+simdutf_really_inline int count_ones(uint64_t input_num)
+{
+    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
 } // unnamed namespace
@@ -169,14 +191,13 @@ simdutf_really_inline int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_ARM64_BITMANIPULATION_H
 /* end file src/simdutf/arm64/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/simd.h
 /* begin file src/simdutf/arm64/simd.h */
 #ifndef SIMDUTF_ARM64_SIMD_H
 #define SIMDUTF_ARM64_SIMD_H
 
 #include <type_traits>
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
@@ -186,7 +207,6 @@ namespace simd {
 namespace {
 // Start of private section with Visual Studio workaround
 
-
 /**
  * make_uint8x16_t initializes a SIMD register (uint8x16_t).
  * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
@@ -198,130 +218,138 @@ namespace {
  * You should not use this function except for compile-time constants:
  * it is not efficient.
  */
-simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
-                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
-                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
-                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
-  // Doing a load like so end ups generating worse code.
-  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
-  //                     x9, x10,x11,x12,x13,x14,x15,x16};
-  // return vld1q_u8(array);
-  uint8x16_t x{};
-  // incredibly, Visual Studio does not allow x[0] = x1
-  x = vsetq_lane_u8(x1, x, 0);
-  x = vsetq_lane_u8(x2, x, 1);
-  x = vsetq_lane_u8(x3, x, 2);
-  x = vsetq_lane_u8(x4, x, 3);
-  x = vsetq_lane_u8(x5, x, 4);
-  x = vsetq_lane_u8(x6, x, 5);
-  x = vsetq_lane_u8(x7, x, 6);
-  x = vsetq_lane_u8(x8, x, 7);
-  x = vsetq_lane_u8(x9, x, 8);
-  x = vsetq_lane_u8(x10, x, 9);
-  x = vsetq_lane_u8(x11, x, 10);
-  x = vsetq_lane_u8(x12, x, 11);
-  x = vsetq_lane_u8(x13, x, 12);
-  x = vsetq_lane_u8(x14, x, 13);
-  x = vsetq_lane_u8(x15, x, 14);
-  x = vsetq_lane_u8(x16, x, 15);
-  return x;
+simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
+    uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
+    uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
+    uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16)
+{
+    // Loading the lanes from a temporary array, as sketched below, ends up
+    // generating worse code than the lane-by-lane inserts used here:
+    //   uint8_t array[16] = {x1, x2, ..., x16};
+    //   return vld1q_u8(array);
+    uint8x16_t x {};
+    // Visual Studio does not allow x[0] = x1, so each lane goes through vsetq_lane_u8.
+    x = vsetq_lane_u8(x1, x, 0);
+    x = vsetq_lane_u8(x2, x, 1);
+    x = vsetq_lane_u8(x3, x, 2);
+    x = vsetq_lane_u8(x4, x, 3);
+    x = vsetq_lane_u8(x5, x, 4);
+    x = vsetq_lane_u8(x6, x, 5);
+    x = vsetq_lane_u8(x7, x, 6);
+    x = vsetq_lane_u8(x8, x, 7);
+    x = vsetq_lane_u8(x9, x, 8);
+    x = vsetq_lane_u8(x10, x, 9);
+    x = vsetq_lane_u8(x11, x, 10);
+    x = vsetq_lane_u8(x12, x, 11);
+    x = vsetq_lane_u8(x13, x, 12);
+    x = vsetq_lane_u8(x14, x, 13);
+    x = vsetq_lane_u8(x15, x, 14);
+    x = vsetq_lane_u8(x16, x, 15);
+    return x;
 }
 
 // We have to do the same work for make_int8x16_t
-simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
-                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
-                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
-                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
-  // Doing a load like so end ups generating worse code.
-  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
-  //                     x9, x10,x11,x12,x13,x14,x15,x16};
-  // return vld1q_s8(array);
-  int8x16_t x{};
-  // incredibly, Visual Studio does not allow x[0] = x1
-  x = vsetq_lane_s8(x1, x, 0);
-  x = vsetq_lane_s8(x2, x, 1);
-  x = vsetq_lane_s8(x3, x, 2);
-  x = vsetq_lane_s8(x4, x, 3);
-  x = vsetq_lane_s8(x5, x, 4);
-  x = vsetq_lane_s8(x6, x, 5);
-  x = vsetq_lane_s8(x7, x, 6);
-  x = vsetq_lane_s8(x8, x, 7);
-  x = vsetq_lane_s8(x9, x, 8);
-  x = vsetq_lane_s8(x10, x, 9);
-  x = vsetq_lane_s8(x11, x, 10);
-  x = vsetq_lane_s8(x12, x, 11);
-  x = vsetq_lane_s8(x13, x, 12);
-  x = vsetq_lane_s8(x14, x, 13);
-  x = vsetq_lane_s8(x15, x, 14);
-  x = vsetq_lane_s8(x16, x, 15);
-  return x;
-}
-
-simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
-                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8) {
-  uint8x8_t x{};
-  x = vset_lane_u8(x1, x, 0);
-  x = vset_lane_u8(x2, x, 1);
-  x = vset_lane_u8(x3, x, 2);
-  x = vset_lane_u8(x4, x, 3);
-  x = vset_lane_u8(x5, x, 4);
-  x = vset_lane_u8(x6, x, 5);
-  x = vset_lane_u8(x7, x, 6);
-  x = vset_lane_u8(x8, x, 7);
-  return x;
-}
-
-simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1,  uint16_t x2,  uint16_t x3,  uint16_t x4,
-                                       uint16_t x5,  uint16_t x6,  uint16_t x7,  uint16_t x8) {
-  uint16x8_t x{};
-  x = vsetq_lane_u16(x1, x, 0);
-  x = vsetq_lane_u16(x2, x, 1);
-  x = vsetq_lane_u16(x3, x, 2);
-  x = vsetq_lane_u16(x4, x, 3);
-  x = vsetq_lane_u16(x5, x, 4);
-  x = vsetq_lane_u16(x6, x, 5);
-  x = vsetq_lane_u16(x7, x, 6);
-  x = vsetq_lane_u16(x8, x, 7);;
-  return x;
-}
-
-simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t x3,  int16_t x4,
-                                       int16_t x5,  int16_t x6,  int16_t x7,  int16_t x8) {
-  uint16x8_t x{};
-  x = vsetq_lane_s16(x1, x, 0);
-  x = vsetq_lane_s16(x2, x, 1);
-  x = vsetq_lane_s16(x3, x, 2);
-  x = vsetq_lane_s16(x4, x, 3);
-  x = vsetq_lane_s16(x5, x, 4);
-  x = vsetq_lane_s16(x6, x, 5);
-  x = vsetq_lane_s16(x7, x, 6);
-  x = vsetq_lane_s16(x8, x, 7);;
-  return x;
+simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
+    int8_t x5, int8_t x6, int8_t x7, int8_t x8,
+    int8_t x9, int8_t x10, int8_t x11, int8_t x12,
+    int8_t x13, int8_t x14, int8_t x15, int8_t x16)
+{
+    // Loading the lanes from a temporary array, as sketched below, ends up
+    // generating worse code than the lane-by-lane inserts used here:
+    //   int8_t array[16] = {x1, x2, ..., x16};
+    //   return vld1q_s8(array);
+    int8x16_t x {};
+    // Visual Studio does not allow x[0] = x1, so each lane goes through vsetq_lane_s8.
+    x = vsetq_lane_s8(x1, x, 0);
+    x = vsetq_lane_s8(x2, x, 1);
+    x = vsetq_lane_s8(x3, x, 2);
+    x = vsetq_lane_s8(x4, x, 3);
+    x = vsetq_lane_s8(x5, x, 4);
+    x = vsetq_lane_s8(x6, x, 5);
+    x = vsetq_lane_s8(x7, x, 6);
+    x = vsetq_lane_s8(x8, x, 7);
+    x = vsetq_lane_s8(x9, x, 8);
+    x = vsetq_lane_s8(x10, x, 9);
+    x = vsetq_lane_s8(x11, x, 10);
+    x = vsetq_lane_s8(x12, x, 11);
+    x = vsetq_lane_s8(x13, x, 12);
+    x = vsetq_lane_s8(x14, x, 13);
+    x = vsetq_lane_s8(x15, x, 14);
+    x = vsetq_lane_s8(x16, x, 15);
+    return x;
+}
+
+simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
+    uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8)
+{
+    uint8x8_t x {};
+    x = vset_lane_u8(x1, x, 0);
+    x = vset_lane_u8(x2, x, 1);
+    x = vset_lane_u8(x3, x, 2);
+    x = vset_lane_u8(x4, x, 3);
+    x = vset_lane_u8(x5, x, 4);
+    x = vset_lane_u8(x6, x, 5);
+    x = vset_lane_u8(x7, x, 6);
+    x = vset_lane_u8(x8, x, 7);
+    return x;
+}
+
+// Visual Studio workaround: assembles a uint16x8_t one lane at a time from eight scalars.
+simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4,
+    uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8)
+{
+    uint16x8_t x {};
+    x = vsetq_lane_u16(x1, x, 0);
+    x = vsetq_lane_u16(x2, x, 1);
+    x = vsetq_lane_u16(x3, x, 2);
+    x = vsetq_lane_u16(x4, x, 3);
+    x = vsetq_lane_u16(x5, x, 4);
+    x = vsetq_lane_u16(x6, x, 5);
+    x = vsetq_lane_u16(x7, x, 6);
+    x = vsetq_lane_u16(x8, x, 7);
+    return x;
+}
+
+// Visual Studio workaround: assembles an int16x8_t one lane at a time from eight scalars.
+simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4,
+    int16_t x5, int16_t x6, int16_t x7, int16_t x8)
+{
+    int16x8_t x {}; // signed vector: the type vsetq_lane_s16 operates on and this function returns
+    x = vsetq_lane_s16(x1, x, 0);
+    x = vsetq_lane_s16(x2, x, 1);
+    x = vsetq_lane_s16(x3, x, 2);
+    x = vsetq_lane_s16(x4, x, 3);
+    x = vsetq_lane_s16(x5, x, 4);
+    x = vsetq_lane_s16(x6, x, 5);
+    x = vsetq_lane_s16(x7, x, 6);
+    x = vsetq_lane_s16(x8, x, 7);
+    return x;
 }
 
-
 // End of private section with Visual Studio workaround
 } // namespace
 #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
 
+template<typename T>
+struct simd8;
 
-  template<typename T>
-  struct simd8;
-
-  //
-  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
-  //
-  template<typename T, typename Mask=simd8<bool>>
-  struct base_u8 {
+//
+// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
+//
+template<typename T, typename Mask = simd8<bool>>
+struct base_u8 {
     uint8x16_t value;
     static const int SIZE = sizeof(value);
 
     // Conversion from/to SIMD register
-    simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
+    simdutf_really_inline base_u8(const uint8x16_t _value)
+        : value(_value)
+    {
+    }
     simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
     simdutf_really_inline operator uint8x16_t&() { return this->value; }
-    simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
-    simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
+    simdutf_really_inline T first() const { return vgetq_lane_u8(*this, 0); }
+    simdutf_really_inline T last() const { return vgetq_lane_u8(*this, 15); }
 
     // Bit operations
     simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
@@ -329,48 +357,74 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
     simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
-    simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
+    simdutf_really_inline simd8<T>& operator|=(const simd8<T> other)
+    {
+        auto this_cast = static_cast<simd8<T>*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline simd8<T>& operator&=(const simd8<T> other)
+    {
+        auto this_cast = static_cast<simd8<T>*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline simd8<T>& operator^=(const simd8<T> other)
+    {
+        auto this_cast = static_cast<simd8<T>*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
 
-    simdutf_really_inline Mask operator==(const simd8<T> other) const { return vceqq_u8(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
 
-    template<int N=1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
-      return vextq_u8(prev_chunk, *this, 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
+    {
+        return vextq_u8(prev_chunk, *this, 16 - N);
     }
-  };
+};
 
-  // SIMD byte mask type (returned by things like eq and gt)
-  template<>
-  struct simd8<bool>: base_u8<bool> {
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd8<bool> : base_u8<bool> {
     typedef uint16_t bitmask_t;
     typedef uint32_t bitmask2_t;
 
     static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
+    simdutf_really_inline simd8(const uint8x16_t _value)
+        : base_u8<bool>(_value)
+    {
+    }
     // False constructor
-    simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
+    simdutf_really_inline simd8()
+        : simd8(vdupq_n_u8(0))
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(bool _value)
+        : simd8(splat(_value))
+    {
+    }
     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
 
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
-    simdutf_really_inline uint32_t to_bitmask() const {
+    simdutf_really_inline uint32_t to_bitmask() const
+    {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t bit_mask =  make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+        const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-      const uint8x16_t bit_mask =  {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+        const uint8x16_t bit_mask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
 #endif
-      auto minput = *this & bit_mask;
-      uint8x16_t tmp = vpaddq_u8(minput, minput);
-      tmp = vpaddq_u8(tmp, tmp);
-      tmp = vpaddq_u8(tmp, tmp);
-      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+        auto minput = *this & bit_mask;
+        uint8x16_t tmp = vpaddq_u8(minput, minput);
+        tmp = vpaddq_u8(tmp, tmp);
+        tmp = vpaddq_u8(tmp, tmp);
+        return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
     }
 
     // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
@@ -378,58 +432,70 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // This method is expected to be faster than none() and is equivalent
     // when the vector register is the result of a comparison, with byte
     // values 0xff and 0x00.
-    simdutf_really_inline uint64_t to_bitmask64() const {
-      return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
+    simdutf_really_inline uint64_t to_bitmask64() const
+    {
+        return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
     }
 
     simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
     simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
     simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
+};
 
-
-  };
-
-  // Unsigned bytes
-  template<>
-  struct simd8<uint8_t>: base_u8<uint8_t> {
+// Unsigned bytes
+template<>
+struct simd8<uint8_t> : base_u8<uint8_t> {
     static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
     static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
     static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
-    simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
+    simdutf_really_inline simd8(const uint8x16_t _value)
+        : base_u8<uint8_t>(_value)
+    {
+    }
     // Zero constructor
-    simdutf_really_inline simd8() : simd8(zero()) {}
+    simdutf_really_inline simd8()
+        : simd8(zero())
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint8_t values[16])
+        : simd8(load(values))
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Member-by-member initialization
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) : simd8(make_uint8x16_t(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+        : simd8(make_uint8x16_t(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
 #else
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) : simd8(uint8x16_t{
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    }) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+        : simd8(uint8x16_t {
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15 })
+    {
+    }
 #endif
 
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) {
-      return simd8<uint8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+    {
+        return simd8<uint8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
     // Store to array
@@ -442,8 +508,16 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
     simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
-    simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
-    simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
+    simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other)
+    {
+        *this = *this + other;
+        return *this;
+    }
+    simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other)
+    {
+        *this = *this - other;
+        return *this;
+    }
 
     // Order-specific operations
     simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
@@ -472,100 +546,116 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return lookup_table.apply_lookup_16_to(*this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return lookup_table.apply_lookup_16_to(*this);
     }
 
-
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
     }
 
     template<typename T>
-    simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const {
-      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
+    simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const
+    {
+        return vqtbl1q_u8(*this, simd8<uint8_t>(original));
     }
-  };
+};
 
-  // Signed bytes
-  template<>
-  struct simd8<int8_t> {
+// Signed bytes
+template<>
+struct simd8<int8_t> {
     int8x16_t value;
 
     static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
     static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
     static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
-    template <endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
-      uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)));
-      uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
-      if (!match_system(big_endian)) {
-        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        #else
-        const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-        #endif
-        first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
-        second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
-      }
-      vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
-      vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
-      vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))));
-      vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))));
-      vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
-      vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
+    template<endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* p) const
+    {
+        uint16x8_t first = vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value)));
+        uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
+        if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
+            second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
+        vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* p) const
+    {
+        vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value))))));
+        vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value)))));
+        vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
+        vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
     }
     // Conversion from/to SIMD register
-    simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
+    simdutf_really_inline simd8(const int8x16_t _value)
+        : value { _value }
+    {
+    }
     simdutf_really_inline operator const int8x16_t&() const { return this->value; }
     simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
     simdutf_really_inline operator int8x16_t&() { return this->value; }
 
     // Zero constructor
-    simdutf_really_inline simd8() : simd8(zero()) {}
+    simdutf_really_inline simd8()
+        : simd8(zero())
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const int8_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) : simd8(make_int8x16_t(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8(make_int8x16_t(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
 #else
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) : simd8(int8x16_t{
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    }) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8(int8x16_t {
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15 })
+    {
+    }
 #endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) {
-      return simd8<int8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
     // Store to array
@@ -576,9 +666,15 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
     // and relatively ugly and hard to read.
 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
-    simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
+    simdutf_really_inline explicit simd8(const uint8x16_t other)
+        : simd8(vreinterpretq_s8_u8(other))
+    {
+    }
 #endif
-    simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
+    simdutf_really_inline operator simd8<uint8_t>() const
+    {
+        return vreinterpretq_u8_s8(this->value);
+    }
 
     simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
     simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
@@ -588,8 +684,16 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // Math
     simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
     simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
-    simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
-    simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
+    simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other)
+    {
+        *this = *this + other;
+        return *this;
+    }
+    simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other)
+    {
+        *this = *this - other;
+        return *this;
+    }
 
     simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
     simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
@@ -602,38 +706,41 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
     simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
 
-    template<int N=1>
-    simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
-      return vextq_s8(prev_chunk, *this, 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const
+    {
+        return vextq_s8(prev_chunk, *this, 16 - N);
     }
 
     // Perform a lookup assuming no value is larger than 16
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return lookup_table.apply_lookup_16_to(*this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return lookup_table.apply_lookup_16_to(*this);
     }
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
     }
 
     template<typename T>
-    simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
-      return vqtbl1q_s8(*this, simd8<uint8_t>(original));
+    simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original)
+    {
+        return vqtbl1q_s8(*this, simd8<uint8_t>(original));
     }
-  };
+};
 
-  template<typename T>
-  struct simd8x64 {
+template<typename T>
+struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -642,159 +749,181 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
-
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
+    {
     }
 
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    }
 
-    simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      this->chunks[2] |= other.chunks[2];
-      this->chunks[3] |= other.chunks[3];
-      return *this;
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        this->chunks[2] |= other.chunks[2];
+        this->chunks[3] |= other.chunks[3];
+        return *this;
     }
 
-    simdutf_really_inline simd8<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return reduce_or().is_ascii();
     }
 
-    template <endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
+    template<endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 3);
     }
 
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t bit_mask = make_uint8x16_t(
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      );
+        const uint8x16_t bit_mask = make_uint8x16_t(
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-      const uint8x16_t bit_mask = {
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      };
+        const uint8x16_t bit_mask = {
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+        };
 #endif
-      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
-      uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
-      uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
-      sum0 = vpaddq_u8(sum0, sum1);
-      sum0 = vpaddq_u8(sum0, sum0);
-      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return  simd8x64<bool>(
-      this->chunks[0] == mask,
-      this->chunks[1] == mask,
-      this->chunks[2] == mask,
-      this->chunks[3] == mask
-    ).to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t lteq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return  simd8x64<bool>(
-      this->chunks[0] <= mask,
-      this->chunks[1] <= mask,
-      this->chunks[2] <= mask,
-      this->chunks[3] <= mask
-    ).to_bitmask();
-  }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-      return  simd8x64<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask,
-        this->chunks[2] > mask,
-        this->chunks[3] > mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask,
-        this->chunks[2] >= mask,
-        this->chunks[3] >= mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
-        simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
-        simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
-        simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
-      ).to_bitmask();
-    }
-  }; // struct simd8x64<T>
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h
+        // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
+        uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
+        uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
+        sum0 = vpaddq_u8(sum0, sum1);
+        sum0 = vpaddq_u8(sum0, sum0);
+        return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+        return simd8x64<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask,
+            this->chunks[2] > mask,
+            this->chunks[3] > mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask,
+            this->chunks[2] >= mask,
+            this->chunks[3] >= mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    {
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
+            simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
+            simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
+            simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask)
+            .to_bitmask();
+    }
+}; // struct simd8x64<T>
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/simd16-inl.h
 /* begin file src/simdutf/arm64/simd16-inl.h */
 template<typename T>
 struct simd16;
 
-  template<typename T, typename Mask=simd16<bool>>
-  struct base_u16 {
+template<typename T, typename Mask = simd16<bool>>
+struct base_u16 {
     uint16x8_t value;
     static const int SIZE = sizeof(value);
 
     // Conversion from/to SIMD register
     simdutf_really_inline base_u16() = default;
-    simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
+    simdutf_really_inline base_u16(const uint16x8_t _value)
+        : value(_value)
+    {
+    }
     simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
     simdutf_really_inline operator uint16x8_t&() { return this->value; }
     // Bit operations
@@ -803,167 +932,244 @@ struct simd16;
     simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
     simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
     simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-    simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
+    simdutf_really_inline simd16<T>& operator|=(const simd16<T> other)
+    {
+        auto this_cast = static_cast<simd16<T>*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline simd16<T>& operator&=(const simd16<T> other)
+    {
+        auto this_cast = static_cast<simd16<T>*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline simd16<T>& operator^=(const simd16<T> other)
+    {
+        auto this_cast = static_cast<simd16<T>*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
 
-    simdutf_really_inline Mask operator==(const simd16<T> other) const { return vceqq_u16(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
 
-    template<int N=1>
-    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-      return vextq_u18(prev_chunk, *this, 8 - N);
+    template<int N = 1>
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    {
+        return vextq_u18(prev_chunk, *this, 8 - N);
     }
-  };
-
-template<typename T, typename Mask=simd16<bool>>
-struct base16: base_u16<T> {
-  typedef uint16_t bitmask_t;
-  typedef uint32_t bitmask2_t;
+};
 
-  simdutf_really_inline base16() : base_u16<T>() {}
-  simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
-  template <typename Pointer>
-  simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
+template<typename T, typename Mask = simd16<bool>>
+struct base16 : base_u16<T> {
+    typedef uint16_t bitmask_t;
+    typedef uint32_t bitmask2_t;
 
-  simdutf_really_inline Mask operator==(const simd16<T> other) const { return vceqq_u16(*this, other); }
+    simdutf_really_inline base16()
+        : base_u16<T>()
+    {
+    }
+    simdutf_really_inline base16(const uint16x8_t _value)
+        : base_u16<T>(_value)
+    {
+    }
+    template<typename Pointer>
+    simdutf_really_inline base16(const Pointer* ptr)
+        : base16(vld1q_u16(ptr))
+    {
+    }
 
-  static const int SIZE = sizeof(base_u16<T>::value);
+    static const int SIZE = sizeof(base_u16<T>::value);
 
-  template<int N=1>
-  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-    return vextq_u18(prev_chunk, *this, 8 - N);
-  }
+    template<int N = 1>
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    {
+        return vextq_u18(prev_chunk, *this, 8 - N);
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool>: base16<bool> {
-  static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
-
-  simdutf_really_inline simd16<bool>() : base16() {}
-  simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
+struct simd16<bool> : base16<bool> {
+    static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
 
+    simdutf_really_inline simd16<bool>()
+        : base16()
+    {
+    }
+    simdutf_really_inline simd16<bool>(const uint16x8_t _value)
+        : base16<bool>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16<bool>(bool _value)
+        : base16<bool>(splat(_value))
+    {
+    }
 };
 
 template<typename T>
-struct base16_numeric: base16<T> {
-  static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
-  static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
-  static simdutf_really_inline simd16<T> load(const T values[8]) {
-    return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
-  }
-
-  simdutf_really_inline base16_numeric() : base16<T>() {}
-  simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
-  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
-  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
-  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+struct base16_numeric : base16<T> {
+    static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
+    static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
+    static simdutf_really_inline simd16<T> load(const T values[8])
+    {
+        return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
+    }
+
+    simdutf_really_inline base16_numeric()
+        : base16<T>()
+    {
+    }
+    simdutf_really_inline base16_numeric(const uint16x8_t _value)
+        : base16<T>(_value)
+    {
+    }
+
+    // Store to array
+    simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
+
+    // Override to distinguish from bool version
+    simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
+    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
+    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd16<T>*>(this);
+    }
+    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd16<T>*>(this);
+    }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+    simdutf_really_inline simd16()
+        : base16_numeric<int16_t>()
+    {
+    }
 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
-  simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
+    simdutf_really_inline simd16(const uint16x8_t _value)
+        : base16_numeric<int16_t>(_value)
+    {
+    }
 #endif
-  simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
-  simdutf_really_inline operator simd16<uint16_t>() const;
-  simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
-  simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
-
-  simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
-  simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
-  // Order-sensitive comparisons
-  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-};
-
+    simdutf_really_inline simd16(const int16x8_t _value)
+        : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value))
+    {
+    }
 
+    // Splat constructor
+    simdutf_really_inline simd16(int16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const int16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const int16_t*>(values)))
+    {
+    }
+    simdutf_really_inline operator simd16<uint16_t>() const;
+    simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
+    simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
 
+    simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
+    simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
+    // Order-sensitive comparisons
+    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+};
 
 // Unsigned words
 template<>
-struct simd16<uint16_t>: base16_numeric<uint16_t>  {
-  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
-  simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
-
-
-  simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
-  simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
-  // Saturated math
-  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
-
-  // Order-specific operations
-  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return  vcgtq_u16(*this, other); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
-
-  // Bit-specific operations
-  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
-
-  // logical operations
-  simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
-
-  // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
-  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
-    return vqmovn_high_u16(vqmovn_u16(v0), v1);
-  }
-
-  // Change the endianness
-  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
-    #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    #else
-    const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-    #endif
-    return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
-  }
+struct simd16<uint16_t> : base16_numeric<uint16_t> {
+    simdutf_really_inline simd16()
+        : base16_numeric<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const uint16x8_t _value)
+        : base16_numeric<uint16_t>(_value)
+    {
+    }
+
+    // Splat constructor
+    simdutf_really_inline simd16(uint16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const uint16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
+    {
+    }
+
+    simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
+    simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
+    // Saturated math
+    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
+    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return vcgtq_u16(*this, other); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
+
+    // Bit-specific operations
+    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
+
+    // logical operations
+    simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
+
+    // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
+    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
+    {
+        return vqmovn_high_u16(vqmovn_u16(v0), v1);
+    }
+
+    // Change the endianness
+    simdutf_really_inline simd16<uint16_t> swap_bytes() const
+    {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+        const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+        return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
+    }
 };
 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
 
-
-  template<typename T>
-  struct simd16x32 {
+template<typename T>
+struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -972,122 +1178,138 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd16x32(const T* ptr)
+        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T)) }
+    {
+    }
 
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
     }
 
-    simdutf_really_inline simd16<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    simdutf_really_inline simd16<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return reduce_or().is_ascii();
     }
 
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
-      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
-      this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
-      this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+        this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+        this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t bit_mask = make_uint8x16_t(
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      );
+        const uint8x16_t bit_mask = make_uint8x16_t(
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-      const uint8x16_t bit_mask = {
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      };
+        const uint8x16_t bit_mask = {
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+        };
 #endif
-      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
-      uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
-      uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
-      sum0 = vpaddq_u8(sum0, sum1);
-      sum0 = vpaddq_u8(sum0, sum0);
-      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-    }
-
-    simdutf_really_inline void swap_bytes() {
-      this->chunks[0] = this->chunks[0].swap_bytes();
-      this->chunks[1] = this->chunks[1].swap_bytes();
-      this->chunks[2] = this->chunks[2].swap_bytes();
-      this->chunks[3] = this->chunks[3].swap_bytes();
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-    const simd16<T> mask = simd16<T>::splat(m);
-    return  simd16x32<bool>(
-      this->chunks[0] == mask,
-      this->chunks[1] == mask,
-      this->chunks[2] == mask,
-      this->chunks[3] == mask
-    ).to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t lteq(const T m) const {
-    const simd16<T> mask = simd16<T>::splat(m);
-    return  simd16x32<bool>(
-      this->chunks[0] <= mask,
-      this->chunks[1] <= mask,
-      this->chunks[2] <= mask,
-      this->chunks[3] <= mask
-    ).to_bitmask();
-  }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
-
-      return  simd16x32<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
-      return  simd16x32<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-
-  }; // struct simd16x32<T>
-  template<>
-  simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
-      const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
-      const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
-      simd16x32<uint16_t> x(
+        // Pairwise-add adjacent bytes, round after round, so that each group of 8 masked bytes collapses into one byte of the result.
+        uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
+        uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
+        sum0 = vpaddq_u8(sum0, sum1);
+        sum0 = vpaddq_u8(sum0, sum0);
+        return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    simdutf_really_inline void swap_bytes() // in-place variant: overwrites every chunk with its byte-swapped value
+    {
+        this->chunks[0] = this->chunks[0].swap_bytes();
+        this->chunks[1] = this->chunks[1].swap_bytes();
+        this->chunks[2] = this->chunks[2].swap_bytes();
+        this->chunks[3] = this->chunks[3].swap_bytes();
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const // returns a 64-bit mask built from `chunk == m` over the four chunks
+    {
+        const simd16<T> mask = simd16<T>::splat(m); // presumably m broadcast to every lane (splat is defined outside this view)
+        return simd16x32<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask(); // collapse the four comparison vectors into one uint64_t
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const // returns a 64-bit mask built from `chunk <= m` over the four chunks
+    {
+        const simd16<T> mask = simd16<T>::splat(m); // presumably m broadcast to every lane (splat is defined outside this view)
+        return simd16x32<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask(); // collapse the four comparison vectors into one uint64_t
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const // mask of lanes with low <= value <= high (both bounds inclusive)
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
+
+        return simd16x32<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask(); // collapse the four per-chunk results into one uint64_t
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const // mask of lanes with value < low or value > high (complement of in_range)
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
+        return simd16x32<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+            .to_bitmask(); // collapse the four per-chunk results into one uint64_t
+    }
+    simdutf_really_inline uint64_t lt(const T m) const // returns a 64-bit mask built from `chunk < m` over the four chunks
+    {
+        const simd16<T> mask = simd16<T>::splat(m); // presumably m broadcast to every lane (splat is defined outside this view)
+        return simd16x32<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask(); // collapse the four comparison vectors into one uint64_t
+    }
+
+}; // struct simd16x32<T>
+template<>
+simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const
+{
+    const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
+    const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
+    simd16x32<uint16_t> x(
         simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
         simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
         simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
-        simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
-      );
-      return  x.to_bitmask();
-    }
+        simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)));
+    return x.to_bitmask();
+}
 /* end file src/simdutf/arm64/simd16-inl.h */
 } // namespace simd
 } // unnamed namespace
@@ -1097,7 +1319,7 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
 #endif // SIMDUTF_ARM64_SIMD_H
 /* end file src/simdutf/arm64/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/end.h
 /* begin file src/simdutf/arm64/end.h */
 /* end file src/simdutf/arm64/end.h */
 
@@ -1105,13 +1327,11 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
 
 #endif // SIMDUTF_ARM64_H
 /* end file src/simdutf/arm64.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake.h
 /* begin file src/simdutf/icelake.h */
 #ifndef SIMDUTF_ICELAKE_H
 #define SIMDUTF_ICELAKE_H
 
-
-
 #ifdef __has_include
 // How do we detect that a compiler supports vbmi2?
 // For sure if the following header is found, we are ok?
@@ -1133,18 +1353,15 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
 #define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
 #endif
 
-// To see why  (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see
+// To see why (__BMI__) && (__LZCNT__) are not part of this next line, see
 // https://github.com/simdutf/simdutf/issues/1247
-#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \
-                                         SIMDUTF_HAS_AVX512DQ && \
-                                         SIMDUTF_HAS_AVX512VL && \
-                                           SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
+#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL && SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
 
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 #define SIMDUTF_TARGET_ICELAKE
 #else
-#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt")
+#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt")
 #endif
 
 namespace simdutf {
@@ -1152,20 +1369,17 @@ namespace icelake {
 } // namespace icelake
 } // namespace simdutf
 
-
-
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/intrinsics.h
 /* begin file src/simdutf/icelake/intrinsics.h */
 #ifndef SIMDUTF_ICELAKE_INTRINSICS_H
 #define SIMDUTF_ICELAKE_INTRINSICS_H
 
-
 #ifdef SIMDUTF_VISUAL_STUDIO
 // under clang within visual studio, this will include <x86intrin.h>
-#include <intrin.h>  // visual studio or clang
+#include <intrin.h> // visual studio or clang
 #include <immintrin.h>
 #else
 
@@ -1179,7 +1393,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
-
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -1209,15 +1422,14 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * <x86intrin.h>  (or <intrin.h>) before, so the headers
  * are fooled.
  */
-#include <bmiintrin.h>   // for _blsr_u64
-#include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
+#include <bmiintrin.h> // for _blsr_u64
+#include <bmi2intrin.h> // for _pext_u64, _pdep_u64
 #include <lzcntintrin.h> // for  __lzcnt64
-#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
+#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
 #include <smmintrin.h>
 #include <tmmintrin.h>
 #include <avxintrin.h>
 #include <avx2intrin.h>
-#include <wmmintrin.h>   // for  _mm_clmulepi64_si128
 // Important: we need the AVX-512 headers:
 #include <avx512fintrin.h>
 #include <avx512dqintrin.h>
@@ -1235,8 +1447,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif //  _blsr_u64
 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
 
-
-
 #if defined(__GNUC__) && !defined(__clang__)
 
 #if __GNUC__ == 8
@@ -1253,27 +1463,27 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /**
  * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
  */
-inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
-  return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
-                          uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
-                          uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
-                          uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
-                          uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
-                          uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
-                          uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
-                          uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
+inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63)
+{
+    return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
+        uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
+        uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
+        uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
+        uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
+        uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
+        uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
+        uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
 }
 #pragma GCC pop_options
 #endif // SIMDUTF_GCC8
 
 #endif // SIMDUTF_HASWELL_INTRINSICS_H
 /* end file src/simdutf/icelake/intrinsics.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/implementation.h
 /* begin file src/simdutf/icelake/implementation.h */
 #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
 #define SIMDUTF_ICELAKE_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace icelake {
 
@@ -1283,63 +1493,88 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation(
-      "icelake",
-      "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
-      internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation(
+            "icelake",
+            "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
+            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace icelake
@@ -1351,7 +1586,7 @@ public:
 //
 // The rest need to be inside the region
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/begin.h
 /* begin file src/simdutf/icelake/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
 // #define SIMDUTF_IMPLEMENTATION icelake
@@ -1363,11 +1598,11 @@ SIMDUTF_TARGET_ICELAKE
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/icelake/begin.h */
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
 /* begin file src/simdutf/icelake/bitmanipulation.h */
 #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
 #define SIMDUTF_ICELAKE_BITMANIPULATION_H
@@ -1377,13 +1612,15 @@ namespace icelake {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num) {
-  return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num)
+{
+    return _popcnt64(input_num);
 }
 #endif
 
@@ -1393,7 +1630,7 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
 /* end file src/simdutf/icelake/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/end.h
 /* begin file src/simdutf/icelake/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 // nothing needed.
@@ -1401,18 +1638,15 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
 /* end file src/simdutf/icelake/end.h */
 
-
-
 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
 #endif // SIMDUTF_ICELAKE_H
 /* end file src/simdutf/icelake.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell.h
 /* begin file src/simdutf/haswell.h */
 #ifndef SIMDUTF_HASWELL_H
 #define SIMDUTF_HASWELL_H
@@ -1424,7 +1658,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #error "haswell.h must be included before fallback.h"
 #endif
 
-
 // Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
 // at runtime.
 #ifndef SIMDUTF_IMPLEMENTATION_HASWELL
@@ -1439,13 +1672,13 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif
 
 #endif
-// To see why  (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see
+// To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
 // https://github.com/simdutf/simdutf/issues/1247
 #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
 
 #if SIMDUTF_IMPLEMENTATION_HASWELL
 
-#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,pclmul,lzcnt")
+#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
 
 namespace simdutf {
 /**
@@ -1458,12 +1691,11 @@ namespace haswell {
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/implementation.h
 /* begin file src/simdutf/haswell/implementation.h */
 #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
 #define SIMDUTF_HASWELL_IMPLEMENTATION_H
 
-
 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
 namespace simdutf {
 namespace haswell {
@@ -1472,64 +1704,88 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation(
-      "haswell",
-      "Intel/AMD AVX2",
-      internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
-  ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation(
+            "haswell",
+            "Intel/AMD AVX2",
+            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace haswell
@@ -1537,15 +1793,14 @@ public:
 
 #endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
 /* end file src/simdutf/haswell/implementation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/intrinsics.h
 /* begin file src/simdutf/haswell/intrinsics.h */
 #ifndef SIMDUTF_HASWELL_INTRINSICS_H
 #define SIMDUTF_HASWELL_INTRINSICS_H
 
-
 #ifdef SIMDUTF_VISUAL_STUDIO
 // under clang within visual studio, this will include <x86intrin.h>
-#include <intrin.h>  // visual studio or clang
+#include <intrin.h> // visual studio or clang
 #else
 
 #if SIMDUTF_GCC11ORMORE
@@ -1558,7 +1813,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
-
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -1585,14 +1839,13 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * <x86intrin.h>  (or <intrin.h>) before, so the headers
  * are fooled.
  */
-#include <bmiintrin.h>   // for _blsr_u64
+#include <bmiintrin.h> // for _blsr_u64
 #include <lzcntintrin.h> // for  __lzcnt64
-#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
+#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
 #include <smmintrin.h>
 #include <tmmintrin.h>
 #include <avxintrin.h>
 #include <avx2intrin.h>
-#include <wmmintrin.h>   // for  _mm_clmulepi64_si128
 // unfortunately, we may not get _blsr_u64, but, thankfully, clang
 // has it as a macro.
 #ifndef _blsr_u64
@@ -1607,7 +1860,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 //
 // The rest need to be inside the region
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/begin.h
 /* begin file src/simdutf/haswell/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
 // #define SIMDUTF_IMPLEMENTATION haswell
@@ -1619,11 +1872,11 @@ SIMDUTF_TARGET_HASWELL
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
 /* begin file src/simdutf/haswell/bitmanipulation.h */
 #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
 #define SIMDUTF_HASWELL_BITMANIPULATION_H
@@ -1633,13 +1886,15 @@ namespace haswell {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num) {
-  return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num)
+{
+    return _popcnt64(input_num);
 }
 #endif
 
@@ -1649,190 +1904,254 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_HASWELL_BITMANIPULATION_H
 /* end file src/simdutf/haswell/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/simd.h
 /* begin file src/simdutf/haswell/simd.h */
 #ifndef SIMDUTF_HASWELL_SIMD_H
 #define SIMDUTF_HASWELL_SIMD_H
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace simd {
 
-  // Forward-declared so they can be used by splat and friends.
-  template<typename Child>
-  struct base {
+// Forward-declared so they can be used by splat and friends.
+template<typename Child>
+struct base {
     __m256i value;
 
     // Zero constructor
-    simdutf_really_inline base() : value{__m256i()} {}
+    simdutf_really_inline base()
+        : value { __m256i() }
+    {
+    }
 
     // Conversion from SIMD register
-    simdutf_really_inline base(const __m256i _value) : value(_value) {}
+    simdutf_really_inline base(const __m256i _value)
+        : value(_value)
+    {
+    }
     // Conversion to SIMD register
     simdutf_really_inline operator const __m256i&() const { return this->value; }
     simdutf_really_inline operator __m256i&() { return this->value; }
-    template <endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
-      __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
-      if (big_endian) {
-        const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-        first = _mm256_shuffle_epi8(first, swap);
-        second = _mm256_shuffle_epi8(second, swap);
-      }
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
+    template<endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
+        __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1));
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            first = _mm256_shuffle_epi8(first, swap);
+            second = _mm256_shuffle_epi8(second, swap);
+        }
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), first);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 16), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    {
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this, 8))));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1)));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this, 1), 8)));
     }
     // Bit operations
     simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
     simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
     simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
-    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
-  };
+    simdutf_really_inline Child& operator|=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator&=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator^=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
+};
 
-  // Forward-declared so they can be used by splat and friends.
-  template<typename T>
-  struct simd8;
+// Forward-declared so they can be used by splat and friends.
+template<typename T>
+struct simd8;
 
-  template<typename T, typename Mask=simd8<bool>>
-  struct base8: base<simd8<T>> {
+template<typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
     typedef uint32_t bitmask_t;
     typedef uint64_t bitmask2_t;
 
-    simdutf_really_inline base8() : base<simd8<T>>() {}
-    simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
-    simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
-    simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
-    simdutf_really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); }
+    simdutf_really_inline base8()
+        : base<simd8<T>>()
+    {
+    }
+    simdutf_really_inline base8(const __m256i _value)
+        : base<simd8<T>>(_value)
+    {
+    }
+    simdutf_really_inline T first() const { return _mm256_extract_epi8(*this, 0); }
+    simdutf_really_inline T last() const { return _mm256_extract_epi8(*this, 31); }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
 
     static const int SIZE = sizeof(base<T>::value);
 
-    template<int N=1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
-      return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
+    {
+        return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
     }
-  };
+};
 
-  // SIMD byte mask type (returned by things like eq and gt)
-  template<>
-  struct simd8<bool>: base8<bool> {
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd8<bool> : base8<bool> {
     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8<bool>() : base8() {}
-    simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
+    simdutf_really_inline simd8<bool>()
+        : base8()
+    {
+    }
+    simdutf_really_inline simd8<bool>(const __m256i _value)
+        : base8<bool>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
+    simdutf_really_inline simd8<bool>(bool _value)
+        : base8<bool>(splat(_value))
+    {
+    }
 
     simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
     simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
     simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
     simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
-  };
+};
 
-  template<typename T>
-  struct base8_numeric: base8<T> {
+template<typename T>
+struct base8_numeric : base8<T> {
     static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
     static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
-    static simdutf_really_inline simd8<T> load(const T values[32]) {
-      return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+    static simdutf_really_inline simd8<T> load(const T values[32])
+    {
+        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
     }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     static simdutf_really_inline simd8<T> repeat_16(
-      T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
-      T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
-    ) {
-      return simd8<T>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15,
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+        T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+    {
+        return simd8<T>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
-    simdutf_really_inline base8_numeric() : base8<T>() {}
-    simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
+    simdutf_really_inline base8_numeric()
+        : base8<T>()
+    {
+    }
+    simdutf_really_inline base8_numeric(const __m256i _value)
+        : base8<T>(_value)
+    {
+    }
 
     // Store to array
-    simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
+    simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), *this); }
 
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd8<T>*>(this);
+    }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd8<T>*>(this);
+    }
 
     // Override to distinguish from bool version
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return _mm256_shuffle_epi8(lookup_table, *this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return _mm256_shuffle_epi8(lookup_table, *this);
     }
 
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
-    }
-  };
-
-
-  // Signed bytes
-  template<>
-  struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
-    simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
+    }
+};
+
+// Signed bytes
+template<>
+struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<int8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m256i _value)
+        : base8_numeric<int8_t>(_value)
+    {
+    }
 
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const int8_t values[32])
+        : simd8(load(values))
+    {
+    }
     simdutf_really_inline operator simd8<uint8_t>() const;
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
-      int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
-      int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
-    ) : simd8(_mm256_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15,
-      v16,v17,v18,v19,v20,v21,v22,v23,
-      v24,v25,v26,v27,v28,v29,v30,v31
-    )) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
+        int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+        int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31)
+        : simd8(_mm256_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v16, v17, v18, v19, v20, v21, v22, v23,
+            v24, v25, v26, v27, v28, v29, v30, v31))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) {
-      return simd8<int8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15,
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
     // Order-sensitive comparisons
@@ -1840,43 +2159,54 @@ namespace simd {
     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
-  };
+};
 
-  // Unsigned bytes
-  template<>
-  struct simd8<uint8_t>: base8_numeric<uint8_t> {
-    simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
-    simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
+// Unsigned bytes
+template<>
+struct simd8<uint8_t> : base8_numeric<uint8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<uint8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m256i _value)
+        : base8_numeric<uint8_t>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint8_t values[32])
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
-      uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
-      uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
-    ) : simd8(_mm256_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15,
-      v16,v17,v18,v19,v20,v21,v22,v23,
-      v24,v25,v26,v27,v28,v29,v30,v31
-    )) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
+        uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31)
+        : simd8(_mm256_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v16, v17, v18, v19, v20, v21, v22, v23,
+            v24, v25, v26, v27, v28, v29, v30, v31))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) {
-      return simd8<uint8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15,
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+    {
+        return simd8<uint8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
-
     // Saturated math
     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
@@ -1910,13 +2240,12 @@ namespace simd {
     // Get one of the bits and make a bitmask out of it.
     // e.g. value.get_bit<7>() gets the high bit
     template<int N>
-    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
-  };
-  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
-
+    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7 - N)); }
+};
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
 
-  template<typename T>
-  struct simd8x64 {
+template<typename T>
+struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -1925,297 +2254,383 @@ namespace simd {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
-    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
+        : chunks { chunk0, chunk1 }
+    {
+    }
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)) }
+    {
+    }
 
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
-      uint64_t r_hi =                       this->chunks[1].to_bitmask();
-      return r_lo | (r_hi << 32);
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r_hi = this->chunks[1].to_bitmask();
+        return r_lo | (r_hi << 32);
     }
 
-    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      return *this;
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        return *this;
     }
 
-    simdutf_really_inline simd8<T> reduce_or() const {
-      return this->chunks[0] | this->chunks[1];
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return this->chunks[0] | this->chunks[1];
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii();
     }
 
-    template <endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
+    template<endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
     }
 
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
     }
 
-    simdutf_really_inline simd8x64<T> bit_or(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return simd8x64<T>(
-        this->chunks[0] | mask,
-        this->chunks[1] | mask
-      );
+    simdutf_really_inline simd8x64<T> bit_or(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<T>(
+            this->chunks[0] | mask,
+            this->chunks[1] | mask);
     }
 
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
-      return  simd8x64<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1]
-      ).to_bitmask();
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
+    {
+        return simd8x64<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1])
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
 
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-      return  simd8x64<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
-      ).to_bitmask();
+        // A Haswell 64-byte block holds exactly two chunks (NUM_CHUNKS == 2),
+        // so only chunks[0] and chunks[1] exist; mirror not_in_range().
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+        return simd8x64<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t gt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask)
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t gteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask)
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
-        (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
-      ).to_bitmask();
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    {
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
+            (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
+            .to_bitmask();
     }
-  }; // struct simd8x64<T>
+}; // struct simd8x64<T>
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/simd16-inl.h
 /* begin file src/simdutf/haswell/simd16-inl.h */
 #ifdef __GNUC__
 #if __GNUC__ < 8
 #define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
-#define _mm256_setr_m128i(xmm2, xmm1)  _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
+#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
 #endif
 #endif
 
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask=simd16<bool>>
-struct base16: base<simd16<T>> {
-  using bitmask_type = uint32_t;
-
-  simdutf_really_inline base16() : base<simd16<T>>() {}
-  simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
-  template <typename Pointer>
-  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
+template<typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> { // common base of the simd16<T> wrappers around a __m256i; Mask is the comparison result type
+    using bitmask_type = uint32_t; // integer type used for per-vector bitmasks
 
-  simdutf_really_inline Mask operator==(const simd16<T> other) const { return _mm256_cmpeq_epi16(*this, other); }
+    simdutf_really_inline base16() // default constructor: defers to base<simd16<T>>()
+        : base<simd16<T>>()
+    {
+    }
+    simdutf_really_inline base16(const __m256i _value) // wrap an existing 256-bit register value
+        : base<simd16<T>>(_value)
+    {
+    }
+    template<typename Pointer>
+    simdutf_really_inline base16(const Pointer* ptr) // unaligned load of one __m256i from ptr
+        : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)))
+    {
+    }
+    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); } // 16-bit lane-wise equality
 
-  /// the size of vector in bytes
-  static const int SIZE = sizeof(base<simd16<T>>::value);
+    /// the size of vector in bytes
+    static const int SIZE = sizeof(base<simd16<T>>::value);
 
-  /// the number of elements of type T a vector can hold
-  static const int ELEMENTS = SIZE / sizeof(T);
+    /// the number of elements of type T a vector can hold
+    static const int ELEMENTS = SIZE / sizeof(T);
 
-  template<int N=1>
-  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-    return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
-  }
+    template<int N = 1>
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const // combines *this with prev_chunk via a byte-wise alignr
+    {
+        return _mm256_alignr_epi8(*this, prev_chunk, 16 - N); // NOTE(review): alignr shifts bytes within each 128-bit lane; confirm 16 - N is the intended shift for 16-bit elements
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool>: base16<bool> {
-  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
+struct simd16<bool> : base16<bool> {
+    static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); } // true -> 0xFFFF in every lane, false -> 0
 
-  simdutf_really_inline simd16<bool>() : base16() {}
-  simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
+    simdutf_really_inline simd16() // declared by injected-class-name: a template-id here is ill-formed in C++20
+        : base16<bool>()
+    {
+    }
+    simdutf_really_inline simd16(const __m256i _value)
+        : base16<bool>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16(bool _value)
+        : base16<bool>(splat(_value))
+    {
+    }
 
-  simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
-  simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
-  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+    simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); } // movemask_epi8 emits one bit per byte, i.e. two bits per 16-bit lane
+    simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); } // true if any bit of the register is set
+    simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; } // XOR with the splat of true
 };
 
 template<typename T>
-struct base16_numeric: base16<T> {
-  static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
-  static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
-  static simdutf_really_inline simd16<T> load(const T values[8]) {
-    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
-  }
-
-  simdutf_really_inline base16_numeric() : base16<T>() {}
-  simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
-  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
-  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
-  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+struct base16_numeric : base16<T> {
+    static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); } // broadcast _value to every 16-bit lane
+    static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); } // all-zero vector
+    static simdutf_really_inline simd16<T> load(const T* values) // unaligned load of one full __m256i (32 bytes) starting at values
+    {
+        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
+    }
+
+    simdutf_really_inline base16_numeric()
+        : base16<T>()
+    {
+    }
+    simdutf_really_inline base16_numeric(const __m256i _value)
+        : base16<T>(_value)
+    {
+    }
+
+    // Store one full __m256i (32 bytes) to dst, unaligned
+    simdutf_really_inline void store(T* dst) const { _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), *this); }
+
+    // Override to distinguish from bool version
+    simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
+    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
+    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd16<T>*>(this); // hand back the derived simd16<T>, not the base
+    }
+    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd16<T>*>(this); // hand back the derived simd16<T>, not the base
+    }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
-  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
-  // Order-sensitive comparisons
-  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
-  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
+    simdutf_really_inline simd16() // default constructor: defers to base16_numeric<int16_t>()
+        : base16_numeric<int16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m256i _value) // wrap an existing register value
+        : base16_numeric<int16_t>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16(int16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const int16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values) // same load, with the char16_t data reinterpreted as int16_t
+        : simd16(load(reinterpret_cast<const int16_t*>(values)))
+    {
+    }
+    // Order-sensitive comparisons
+    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); } // signed lane-wise maximum
+    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); } // signed lane-wise minimum
+    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); } // signed *this > other
+    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); } // signed *this < other, expressed as other > *this
 };
 
 // Unsigned words
 template<>
-struct simd16<uint16_t>: base16_numeric<uint16_t>  {
-  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
-  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
-
-  // Saturated math
-  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
-
-  // Order-specific operations
-  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-
-  // Bit-specific operations
-  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
-
-  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
-  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
-  simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
-  simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
-  // Get one of the bits and make a bitmask out of it.
-  // e.g. value.get_bit<7>() gets the high bit
-  template<int N>
-  simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
-
-  // Change the endianness
-  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
-    const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-    return _mm256_shuffle_epi8(*this, swap);
-  }
-
-  // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
-  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
-    // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
-    //       we have to shuffle lanes in order to produce bytes in the
-    //       correct order.
-
-    // get the 0th lanes
-    const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
-    const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
-
-    // get the 1st lanes
-    const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
-    const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
-
-    // build new vectors (shuffle lanes)
-    const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
-    const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
-
-    // pack words in linear order from v0 and v1
-    return _mm256_packus_epi16(t0, t1);
-  }
-};
+struct simd16<uint16_t> : base16_numeric<uint16_t> {
+    simdutf_really_inline simd16()
+        : base16_numeric<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m256i _value)
+        : base16_numeric<uint16_t>(_value)
+    {
+    }
+
+    // Splat constructor
+    simdutf_really_inline simd16(uint16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const uint16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
+    {
+    }
+
+    // Saturated math
+    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
+    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; } // max(other, *this) == other
+    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; } // min(other, *this) == other
+    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); } // must use lt_bits; gt_bits would duplicate operator>
+
+    // Bit-specific operations
+    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+    simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
+    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+    simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
+    simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
+    // Get one of the bits and make a bitmask out of it (movemask_epi8 yields one mask bit per byte).
+    // e.g. value.get_bit<15>() gets the high bit of each 16-bit lane
+    template<int N>
+    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15 - N)); }
+
+    // Change the endianness
+    simdutf_really_inline simd16<uint16_t> swap_bytes() const
+    {
+        const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+            17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+        return _mm256_shuffle_epi8(*this, swap);
+    }
+
+    // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
+    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
+    {
+        // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
+        //       we have to shuffle lanes in order to produce bytes in the
+        //       correct order.
 
+        // get the 0th lanes
+        const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
+        const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
+
+        // get the 1st lanes
+        const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
+        const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
+
+        // build new vectors (shuffle lanes)
+        const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
+        const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
+
+        // pack words in linear order from v0 and v1
+        return _mm256_packus_epi16(t0, t1);
+    }
+};
 
-  template<typename T>
-  struct simd16x32 {
+template<typename T>
+struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -2224,96 +2639,114 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
-    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) // build the block from two already-constructed chunks
+        : chunks { chunk0, chunk1 }
+    {
+    }
+    simdutf_really_inline simd16x32(const T* ptr) // load two consecutive chunks starting at ptr
+        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)) } // second load starts sizeof(simd16<T>) / sizeof(T) elements after ptr
+    {
+    }
 
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
+    simdutf_really_inline void store(T* ptr) const // write both chunks back-to-back starting at ptr
+    {
+        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T)); // element offset 0
+        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T)); // one chunk's worth of T elements further on
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
-      uint64_t r_hi =                       this->chunks[1].to_bitmask();
-      return r_lo | (r_hi << 32);
+    // Concatenate the per-chunk masks: chunk 0 fills bits 0..31, chunk 1 fills bits 32..63.
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        const uint64_t low_half = uint32_t(this->chunks[0].to_bitmask());
+        return low_half | (uint64_t(this->chunks[1].to_bitmask()) << 32);
     }
 
-    simdutf_really_inline simd16<T> reduce_or() const {
-      return this->chunks[0] | this->chunks[1];
+    simdutf_really_inline simd16<T> reduce_or() const // bitwise OR of the two chunks
+    {
+        return this->chunks[0] | this->chunks[1];
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const // delegates to is_ascii() on the OR of both chunks
+    {
+        return this->reduce_or().is_ascii(); // NOTE(review): no simd16<T>::is_ascii is visible in this part of the file; confirm it exists before instantiating
     }
 
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
-      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const // forwards to store_ascii_as_utf16 on each chunk
+    {
+        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0); // NOTE(review): callee is not visible on simd16<T> in this part of the file
+        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>)); // NOTE(review): offset is sizeof(simd16<T>) char16_t elements, not bytes; confirm intended
     }
 
-    simdutf_really_inline simd16x32<T> bit_or(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return simd16x32<T>(
-        this->chunks[0] | mask,
-        this->chunks[1] | mask
-      );
+    // OR the scalar m into every lane of both chunks and return the result as a new block.
+    simdutf_really_inline simd16x32<T> bit_or(const T m) const
+    {
+        const simd16<T> splatted = simd16<T>::splat(m);
+        const simd16<T> first = this->chunks[0] | splatted, second = this->chunks[1] | splatted;
+        return simd16x32<T>(first, second);
     }
 
-    simdutf_really_inline void swap_bytes() {
-      this->chunks[0] = this->chunks[0].swap_bytes();
-      this->chunks[1] = this->chunks[1].swap_bytes();
+    simdutf_really_inline void swap_bytes() // in place: replace each chunk with the result of its own swap_bytes()
+    {
+        this->chunks[0] = this->chunks[0].swap_bytes();
+        this->chunks[1] = this->chunks[1].swap_bytes();
     }
 
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask
-      ).to_bitmask();
+    // Bitmask of the lanes (across both chunks) that equal the scalar m.
+    // The bit layout is whatever simd16x32<bool>::to_bitmask() produces.
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd16<T> needle = simd16<T>::splat(m);
+        simd16x32<bool> matches(this->chunks[0] == needle, this->chunks[1] == needle);
+        return matches.to_bitmask();
     }
 
-    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
-      return  simd16x32<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1]
-      ).to_bitmask();
+    // Lane-wise equality against another block, chunk by chunk, returned as a bitmask.
+    // The bit layout is whatever simd16x32<bool>::to_bitmask() produces.
+    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t>& other) const
+    {
+        simd16x32<bool> matches(this->chunks[0] == other.chunks[0], this->chunks[1] == other.chunks[1]);
+        return matches.to_bitmask();
     }
 
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask
-      ).to_bitmask();
+    // Bitmask of the lanes (across both chunks) that are <= the scalar m.
+    // The bit layout is whatever simd16x32<bool>::to_bitmask() produces.
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd16<T> ceiling = simd16<T>::splat(m);
+        simd16x32<bool> within(this->chunks[0] <= ceiling, this->chunks[1] <= ceiling);
+        return within.to_bitmask();
     }
 
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
+    // Bitmask of the lanes (across both chunks) with low <= lane <= high.
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low), mask_high = simd16<T>::splat(high);
 
-      return  simd16x32<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
-      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
-      return simd16x32<bool>(
-        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
-      ).to_bitmask();
+        // simd16x32 holds exactly two chunks (see the NUM_CHUNKS static_assert), and its
+        // constructor takes two masks, so only chunks[0] and chunks[1] are tested.
+        return simd16x32<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask
-      ).to_bitmask();
+    // Bitmask of the lanes that are <= low - 1 or >= high + 1 (both bounds are cast back to T before splatting).
+    // NOTE(review): low - 1 / high + 1 can wrap after the cast for extreme arguments; confirm callers avoid that.
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd16<T> below = simd16<T>::splat(static_cast<T>(low - 1));
+        const simd16<T> above = simd16<T>::splat(static_cast<T>(high + 1));
+        simd16x32<bool> outside((this->chunks[0] >= above) | (this->chunks[0] <= below), (this->chunks[1] >= above) | (this->chunks[1] <= below));
+        return outside.to_bitmask();
+    }
+    // Bitmask of the lanes (across both chunks) that are < the scalar m.
+    // The bit layout is whatever simd16x32<bool>::to_bitmask() produces.
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd16<T> bound = simd16<T>::splat(m);
+        simd16x32<bool> smaller(this->chunks[0] < bound, this->chunks[1] < bound);
+        return smaller.to_bitmask();
     }
-  }; // struct simd16x32<T>
+}; // struct simd16x32<T>
 /* end file src/simdutf/haswell/simd16-inl.h */
 
 } // namespace simd
@@ -2325,7 +2758,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
 #endif // SIMDUTF_HASWELL_SIMD_H
 /* end file src/simdutf/haswell/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/end.h
 /* begin file src/simdutf/haswell/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
 // nothing needed.
@@ -2333,7 +2766,6 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -2342,7 +2774,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif // SIMDUTF_IMPLEMENTATION_HASWELL
 #endif // SIMDUTF_HASWELL_COMMON_H
 /* end file src/simdutf/haswell.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere.h
 /* begin file src/simdutf/westmere.h */
 #ifndef SIMDUTF_WESTMERE_H
 #define SIMDUTF_WESTMERE_H
@@ -2351,7 +2783,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #error "westmere.h must be included before fallback.h"
 #endif
 
-
 // Default Westmere to on if this is x86-64, unless we'll always select Haswell.
 #ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
 //
@@ -2366,11 +2797,11 @@ SIMDUTF_POP_DISABLE_WARNINGS
 
 #endif
 
-#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__ && __PCLMUL__)
+#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
 
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
 
-#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,pclmul")
+#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
 
 namespace simdutf {
 /**
@@ -2383,12 +2814,11 @@ namespace westmere {
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/implementation.h
 /* begin file src/simdutf/westmere/implementation.h */
 #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
 #define SIMDUTF_WESTMERE_IMPLEMENTATION_H
 
-
 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
 namespace simdutf {
 namespace westmere {
@@ -2399,60 +2829,85 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace westmere
@@ -2460,7 +2915,7 @@ public:
 
 #endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
 /* end file src/simdutf/westmere/implementation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/intrinsics.h
 /* begin file src/simdutf/westmere/intrinsics.h */
 #ifndef SIMDUTF_WESTMERE_INTRINSICS_H
 #define SIMDUTF_WESTMERE_INTRINSICS_H
@@ -2480,7 +2935,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
-
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -2488,7 +2942,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 
 #endif // SIMDUTF_VISUAL_STUDIO
 
-
 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
 /**
  * You are not supposed, normally, to include these
@@ -2498,19 +2951,16 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * only get included *if* the corresponding features are detected
  * from macros:
  */
-#include <smmintrin.h>  // for _mm_alignr_epi8
-#include <wmmintrin.h>  // for  _mm_clmulepi64_si128
+#include <smmintrin.h> // for _mm_alignr_epi8
 #endif
 
-
-
 #endif // SIMDUTF_WESTMERE_INTRINSICS_H
 /* end file src/simdutf/westmere/intrinsics.h */
 
 //
 // The rest need to be inside the region
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/begin.h
 /* begin file src/simdutf/westmere/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
 // #define SIMDUTF_IMPLEMENTATION westmere
@@ -2523,7 +2973,7 @@ SIMDUTF_TARGET_WESTMERE
 /* end file src/simdutf/westmere/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
 /* begin file src/simdutf/westmere/bitmanipulation.h */
 #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
 #define SIMDUTF_WESTMERE_BITMANIPULATION_H
@@ -2533,13 +2983,15 @@ namespace westmere {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num) {
-  return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num)
+{
+    return _popcnt64(input_num);
 }
 #endif
 
@@ -2549,7 +3001,7 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
 /* end file src/simdutf/westmere/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/simd.h
 /* begin file src/simdutf/westmere/simd.h */
 #ifndef SIMDUTF_WESTMERE_SIMD_H
 #define SIMDUTF_WESTMERE_SIMD_H
@@ -2559,110 +3011,155 @@ namespace westmere {
 namespace {
 namespace simd {
 
-  template<typename Child>
-  struct base {
+template<typename Child>
+struct base {
     __m128i value;
 
     // Zero constructor
-    simdutf_really_inline base() : value{__m128i()} {}
+    simdutf_really_inline base()
+        : value { __m128i() }
+    {
+    }
 
     // Conversion from SIMD register
-    simdutf_really_inline base(const __m128i _value) : value(_value) {}
+    simdutf_really_inline base(const __m128i _value)
+        : value(_value)
+    {
+    }
     // Conversion to SIMD register
     simdutf_really_inline operator const __m128i&() const { return this->value; }
     simdutf_really_inline operator __m128i&() { return this->value; }
-    template <endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
-      __m128i first = _mm_cvtepu8_epi16(*this);
-      __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        first = _mm_shuffle_epi8(first, swap);
-        second = _mm_shuffle_epi8(second, swap);
-      }
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
+    template<endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* p) const
+    {
+        __m128i first = _mm_cvtepu8_epi16(*this);
+        __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8));
+        if (big_endian) {
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            first = _mm_shuffle_epi8(first, swap);
+            second = _mm_shuffle_epi8(second, swap);
+        }
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), first);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 8), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* p) const
+    {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), _mm_cvtepu8_epi32(*this));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 4), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 8), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 12), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12)));
     }
     // Bit operations
     simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
     simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
     simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
-    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
-  };
+    simdutf_really_inline Child& operator|=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator&=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator^=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
+};
 
-  // Forward-declared so they can be used by splat and friends.
-  template<typename T>
-  struct simd8;
+// Forward-declared so they can be used by splat and friends.
+template<typename T>
+struct simd8;
 
-  template<typename T, typename Mask=simd8<bool>>
-  struct base8: base<simd8<T>> {
+template<typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
     typedef uint16_t bitmask_t;
     typedef uint32_t bitmask2_t;
 
-    simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
-    simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
-    simdutf_really_inline base8() : base<simd8<T>>() {}
-    simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+    simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); }
+    simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); }
+    simdutf_really_inline base8()
+        : base<simd8<T>>()
+    {
+    }
+    simdutf_really_inline base8(const __m128i _value)
+        : base<simd8<T>>(_value)
+    {
+    }
 
-    simdutf_really_inline Mask operator==(const simd8<T> other) const { return _mm_cmpeq_epi8(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
 
     static const int SIZE = sizeof(base<simd8<T>>::value);
 
-    template<int N=1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
-      return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
+    {
+        return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
     }
-  };
+};
 
-  // SIMD byte mask type (returned by things like eq and gt)
-  template<>
-  struct simd8<bool>: base8<bool> {
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd8<bool> : base8<bool> {
     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8<bool>() : base8() {}
-    simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
+    simdutf_really_inline simd8<bool>()
+        : base8()
+    {
+    }
+    simdutf_really_inline simd8<bool>(const __m128i _value)
+        : base8<bool>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
+    simdutf_really_inline simd8<bool>(bool _value)
+        : base8<bool>(splat(_value))
+    {
+    }
 
     simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
     simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
     simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
     simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
-  };
+};
 
-  template<typename T>
-  struct base8_numeric: base8<T> {
+template<typename T>
+struct base8_numeric : base8<T> {
     static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
     static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
-    static simdutf_really_inline simd8<T> load(const T values[16]) {
-      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+    static simdutf_really_inline simd8<T> load(const T values[16])
+    {
+        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
     }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     static simdutf_really_inline simd8<T> repeat_16(
-      T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
-      T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
-    ) {
-      return simd8<T>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+        T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+    {
+        return simd8<T>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
-    simdutf_really_inline base8_numeric() : base8<T>() {}
-    simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
+    simdutf_really_inline base8_numeric()
+        : base8<T>()
+    {
+    }
+    simdutf_really_inline base8_numeric(const __m128i _value)
+        : base8<T>(_value)
+    {
+    }
 
     // Store to array
-    simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
+    simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), *this); }
 
     // Override to distinguish from bool version
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
@@ -2670,56 +3167,77 @@ namespace simd {
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd8<T>*>(this);
+    }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd8<T>*>(this);
+    }
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return _mm_shuffle_epi8(lookup_table, *this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return _mm_shuffle_epi8(lookup_table, *this);
     }
 
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
-    }
-  };
-
-  // Signed bytes
-  template<>
-  struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
-    simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
+    }
+};
+
+// Signed bytes
+template<>
+struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<int8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<int8_t>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const int8_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) : simd8(_mm_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8(_mm_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) {
-      return simd8<int8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
     simdutf_really_inline operator simd8<uint8_t>() const;
     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
@@ -2729,35 +3247,47 @@ namespace simd {
     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
-  };
+};
 
-  // Unsigned bytes
-  template<>
-  struct simd8<uint8_t>: base8_numeric<uint8_t>  {
-    simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
-    simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
+// Unsigned bytes
+template<>
+struct simd8<uint8_t> : base8_numeric<uint8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<uint8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<uint8_t>(_value)
+    {
+    }
 
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint8_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) : simd8(_mm_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+        : simd8(_mm_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) {
-      return simd8<uint8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+    {
+        return simd8<uint8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
     // Saturated math
@@ -2794,30 +3324,44 @@ namespace simd {
     // Get one of the bits and make a bitmask out of it.
     // e.g. value.get_bit<7>() gets the high bit
     template<int N>
-    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
-  };
-  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
+    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); }
+};
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
 
-  // Unsigned bytes
-  template<>
-  struct simd8<uint16_t>: base<uint16_t> {
+// Unsigned 16-bit words
+template<>
+struct simd8<uint16_t> : base<uint16_t> {
     static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
-    static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
-      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+    static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8])
+    {
+        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
     }
 
-    simdutf_really_inline simd8() : base<uint16_t>() {}
-    simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
+    simdutf_really_inline simd8()
+        : base<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base<uint16_t>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint16_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint16_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      uint16_t v0,  uint16_t v1,  uint16_t v2,  uint16_t v3,  uint16_t v4,  uint16_t v5,  uint16_t v6,  uint16_t v7
-    ) : simd8(_mm_setr_epi16(
-      v0, v1, v2, v3, v4, v5, v6, v7
-    )) {}
+        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+        : simd8(_mm_setr_epi16(
+            v0, v1, v2, v3, v4, v5, v6, v7))
+    {
+    }
 
     // Saturated math
     simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
@@ -2844,9 +3388,9 @@ namespace simd {
     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-     };
-  template<typename T>
-  struct simd8x64 {
+};
+template<typename T>
+struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -2855,303 +3399,395 @@ namespace simd {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
-
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    { // Builds the block from four ready-made chunks, stored in argument order.
+    }
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
+    { // Loads four consecutive chunks from ptr; chunk k starts k * sizeof(simd8<T>) / sizeof(T) elements of T past ptr.
+    }
+
+    simdutf_really_inline void store(T* ptr) const
+    { // Writes chunk k at ptr + sizeof(simd8<T>) * k / sizeof(T), so the four chunks land back-to-back.
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    }
+
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    { // ORs each chunk of other into the chunk of this block with the same index, in place.
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        this->chunks[2] |= other.chunks[2];
+        this->chunks[3] |= other.chunks[3];
+        return *this; // returned by reference so the operation can be chained
+    }
+
+    simdutf_really_inline simd8<T> reduce_or() const
+    { // Folds the four chunks into a single simd8<T> with |, combining (0,1) and (2,3) before the final OR.
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    }
+
+    simdutf_really_inline bool is_ascii() const
+    { // Runs simd8<T>::is_ascii() once on the OR of all four chunks instead of once per chunk.
+        return this->reduce_or().is_ascii();
+    }
+
+    template<endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    { // Forwards chunk k to simd8<T>::store_ascii_as_utf16<endian> at ptr + sizeof(simd8<T>) * k (offset counted in char16_t units).
+        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 3);
+    }
+
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    { // Forwards chunk k to simd8<T>::store_ascii_as_utf32 at ptr + sizeof(simd8<T>) * k (offset counted in char32_t units).
+        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const
+    { // Packs the four per-chunk masks into one 64-bit word; chunk k's mask is shifted left by 16 * k.
+        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask()); // NOTE(review): only r0 goes through uint32_t first; presumably guards against sign extension - confirm.
+        uint64_t r1 = this->chunks[1].to_bitmask();
+        uint64_t r2 = this->chunks[2].to_bitmask();
+        uint64_t r3 = this->chunks[3].to_bitmask();
+        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    { // Broadcasts m, compares every chunk with ==, and returns the combined simd8x64<bool>::to_bitmask() of the results.
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
+    { // Chunk-wise == against other, returned as a bitmask. NOTE(review): the parameter is simd8x64<uint8_t>, not simd8x64<T> - confirm this is intended.
+        return simd8x64<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1],
+            this->chunks[2] == other.chunks[2],
+            this->chunks[3] == other.chunks[3])
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    { // Broadcasts m, applies simd8<T>'s <= to every chunk, and returns the results as a combined bitmask.
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    { // Bitmask of (chunk <= high) & (chunk >= low) for each chunk, so both bounds are inclusive.
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    { // Bitmask of (chunk >= high + 1) | (chunk <= low - 1) for each chunk, i.e. strictly outside [low, high].
+        const simd8<T> mask_low = simd8<T>::splat(low - 1); // NOTE(review): presumably narrowed back to T by splat, so this wraps when low is T's minimum - confirm callers avoid it.
+        const simd8<T> mask_high = simd8<T>::splat(high + 1); // NOTE(review): likewise wraps when high is T's maximum - confirm.
+        return simd8x64<bool>(
+            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+            (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+            (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    { // Broadcasts m, applies simd8<T>'s < to every chunk, and returns the results as a combined bitmask.
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t gt(const T m) const
+    { // Broadcasts m, applies simd8<T>'s > to every chunk, and returns the results as a combined bitmask.
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask,
+            this->chunks[2] > mask,
+            this->chunks[3] > mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const
+    { // Broadcasts m, applies simd8<T>'s >= to every chunk, and returns the results as a combined bitmask.
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask,
+            this->chunks[2] >= mask,
+            this->chunks[3] >= mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    { // Rewraps each chunk as simd8<uint8_t> (via __m128i) before >=, so simd8<uint8_t>'s operator>= is used whatever T is.
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
+            simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
+            simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
+            simd8<uint8_t>(__m128i(this->chunks[3])) >= mask)
+            .to_bitmask();
     }
+}; // struct simd8x64<T>
 
-    simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      this->chunks[2] |= other.chunks[2];
-      this->chunks[3] |= other.chunks[3];
-      return *this;
-    }
-
-    simdutf_really_inline simd8<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
-    }
-
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
-    }
-
-    template <endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
-    }
-
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
-    }
-
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
-      uint64_t r1 =          this->chunks[1].to_bitmask() ;
-      uint64_t r2 =          this->chunks[2].to_bitmask() ;
-      uint64_t r3 =          this->chunks[3].to_bitmask() ;
-      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask,
-        this->chunks[2] == mask,
-        this->chunks[3] == mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
-      return  simd8x64<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1],
-        this->chunks[2] == other.chunks[2],
-        this->chunks[3] == other.chunks[3]
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask,
-        this->chunks[2] <= mask,
-        this->chunks[3] <= mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low-1);
-      const simd8<T> mask_high = simd8<T>::splat(high+1);
-      return simd8x64<bool>(
-        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
-        (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
-        (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask,
-        this->chunks[2] > mask,
-        this->chunks[3] > mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask,
-        this->chunks[2] >= mask,
-        this->chunks[3] >= mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
-        simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
-        simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
-        simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
-      ).to_bitmask();
-    }
-  }; // struct simd8x64<T>
-
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/simd16-inl.h
 /* begin file src/simdutf/westmere/simd16-inl.h */
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask=simd16<bool>>
-struct base16: base<simd16<T>> {
-  typedef uint16_t bitmask_t;
-  typedef uint32_t bitmask2_t;
+template<typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> {
+    typedef uint16_t bitmask_t;
+    typedef uint32_t bitmask2_t;
 
-  simdutf_really_inline base16() : base<simd16<T>>() {}
-  simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
-  template <typename Pointer>
-  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
+    simdutf_really_inline base16()
+        : base<simd16<T>>()
+    {
+    }
+    simdutf_really_inline base16(const __m128i _value)
+        : base<simd16<T>>(_value)
+    {
+    }
+    template<typename Pointer>
+    simdutf_really_inline base16(const Pointer* ptr)
+        : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)))
+    {
+    }
 
-  simdutf_really_inline Mask operator==(const simd16<T> other) const { return _mm_cmpeq_epi16(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
 
-  static const int SIZE = sizeof(base<simd16<T>>::value);
+    static const int SIZE = sizeof(base<simd16<T>>::value);
 
-  template<int N=1>
-  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-    return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
-  }
+    template<int N = 1>
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    { // Result is the last N bytes of prev_chunk followed by the first 16 - N bytes of *this.
+        return _mm_alignr_epi8(*this, prev_chunk, 16 - N); // shift is in bytes, so N counts bytes; NOTE(review): confirm callers do not expect 16-bit lanes.
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool>: base16<bool> {
-  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
+struct simd16<bool> : base16<bool> {
+    static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
 
-  simdutf_really_inline simd16<bool>() : base16() {}
-  simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
+    simdutf_really_inline simd16<bool>()
+        : base16()
+    {
+    }
+    simdutf_really_inline simd16<bool>(const __m128i _value)
+        : base16<bool>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16<bool>(bool _value)
+        : base16<bool>(splat(_value))
+    {
+    }
 
-  simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
-  simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
-  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+    simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
+    simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
+    simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
 };
 
 template<typename T>
-struct base16_numeric: base16<T> {
-  static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
-  static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
-  static simdutf_really_inline simd16<T> load(const T values[8]) {
-    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
-  }
-
-  simdutf_really_inline base16_numeric() : base16<T>() {}
-  simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
-  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
-  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
-  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+struct base16_numeric : base16<T> {
+    static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
+    static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
+    static simdutf_really_inline simd16<T> load(const T values[8])
+    {
+        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
+    }
+
+    simdutf_really_inline base16_numeric()
+        : base16<T>()
+    {
+    }
+    simdutf_really_inline base16_numeric(const __m128i _value)
+        : base16<T>(_value)
+    {
+    }
+
+    // Store to array
+    simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), *this); }
+
+    // Bitwise NOT of every 16-bit lane (distinct from the bool version); the XOR mask must be all ones across the whole word, not only its low byte.
+    simdutf_really_inline simd16<T> operator~() const { return *this ^ splat(static_cast<T>(0xFFFFu)); }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
+    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
+    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd16<T>*>(this);
+    }
+    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd16<T>*>(this);
+    }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
-  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
-  // Member-by-member initialization
-  simdutf_really_inline simd16(
-    int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
-    : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
-  simdutf_really_inline operator simd16<uint16_t>() const;
-
-  // Order-sensitive comparisons
-  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
-  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
+    simdutf_really_inline simd16()
+        : base16_numeric<int16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m128i _value)
+        : base16_numeric<int16_t>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16(int16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const int16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const int16_t*>(values)))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline simd16(
+        int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
+        : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7))
+    {
+    }
+    simdutf_really_inline operator simd16<uint16_t>() const;
+
+    // Order-sensitive comparisons
+    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
+    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
 };
 
 // Unsigned words
 template<>
-struct simd16<uint16_t>: base16_numeric<uint16_t>  {
-  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
-  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
-  // Member-by-member initialization
-  simdutf_really_inline simd16(
-    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
-  : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  simdutf_really_inline static simd16<uint16_t> repeat_16(
-    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
-  ) {
-    return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
-  }
-
-  // Saturated math
-  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
-
-  // Order-specific operations
-  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-
-  // Bit-specific operations
-  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
-
-  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
-  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
-  simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
-  simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
-  // Get one of the bits and make a bitmask out of it.
-  // e.g. value.get_bit<7>() gets the high bit
-  template<int N>
-  simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
-
-  // Change the endianness
-  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
-    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    return _mm_shuffle_epi8(*this, swap);
-  }
+struct simd16<uint16_t> : base16_numeric<uint16_t> {
+    simdutf_really_inline simd16()
+        : base16_numeric<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m128i _value)
+        : base16_numeric<uint16_t>(_value)
+    {
+    }
 
-  // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
-  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
-    return _mm_packus_epi16(v0, v1);
-  }
+    // Splat constructor
+    simdutf_really_inline simd16(uint16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const uint16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline simd16(
+        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+        : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7))
+    {
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    simdutf_really_inline static simd16<uint16_t> repeat_16(
+        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+    {
+        return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
+    }
+
+    // Saturated math
+    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
+    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); } // lt_bits is nonzero exactly in the lanes where *this < other
+
+    // Bit-specific operations
+    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+    simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
+    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+    simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
+    simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
+    // Get one of the bits and make a bitmask out of it.
+    // e.g. value.get_bit<7>() gets the high bit
+    template<int N>
+    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); }
+
+    // Change the endianness
+    simdutf_really_inline simd16<uint16_t> swap_bytes() const
+    {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        return _mm_shuffle_epi8(*this, swap);
+    }
+
+    // Pack two uint16_t vectors into a single uint8_t vector, using unsigned saturation
+    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
+    {
+        return _mm_packus_epi16(v0, v1);
+    }
 };
 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
 
 template<typename T>
-  struct simd16x32 {
+struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -3160,106 +3796,124 @@ template<typename T>
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
-
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
-    }
-
-    simdutf_really_inline simd16<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
-    }
-
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
-    }
-
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
-      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
-      this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
-      this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
-    }
-
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
-      uint64_t r1 =          this->chunks[1].to_bitmask() ;
-      uint64_t r2 =          this->chunks[2].to_bitmask() ;
-      uint64_t r3 =          this->chunks[3].to_bitmask() ;
-      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
-
-    simdutf_really_inline void swap_bytes() {
-      this->chunks[0] = this->chunks[0].swap_bytes();
-      this->chunks[1] = this->chunks[1].swap_bytes();
-      this->chunks[2] = this->chunks[2].swap_bytes();
-      this->chunks[3] = this->chunks[3].swap_bytes();
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask,
-        this->chunks[2] == mask,
-        this->chunks[3] == mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
-      return  simd16x32<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1],
-        this->chunks[2] == other.chunks[2],
-        this->chunks[3] == other.chunks[3]
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask,
-        this->chunks[2] <= mask,
-        this->chunks[3] <= mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
-
-      return  simd16x32<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
-      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
-      return simd16x32<bool>(
-        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
-        (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
-        (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-  }; // struct simd16x32<T>
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd16x32(const T* ptr)
+        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T)) }
+    {
+    }
+
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
+    }
+
+    simdutf_really_inline simd16<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    }
+
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii();
+    }
+
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+        this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+        this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r1 = this->chunks[1].to_bitmask();
+        uint64_t r2 = this->chunks[2].to_bitmask();
+        uint64_t r3 = this->chunks[3].to_bitmask();
+        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
+
+    simdutf_really_inline void swap_bytes()
+    {
+        this->chunks[0] = this->chunks[0].swap_bytes();
+        this->chunks[1] = this->chunks[1].swap_bytes();
+        this->chunks[2] = this->chunks[2].swap_bytes();
+        this->chunks[3] = this->chunks[3].swap_bytes();
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t>& other) const
+    {
+        return simd16x32<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1],
+            this->chunks[2] == other.chunks[2],
+            this->chunks[3] == other.chunks[3])
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
+
+        return simd16x32<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
+        const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
+        return simd16x32<bool>(
+            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+            (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+            (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+}; // struct simd16x32<T>
 /* end file src/simdutf/westmere/simd16-inl.h */
 
 } // namespace simd
@@ -3270,7 +3924,7 @@ template<typename T>
 #endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
 /* end file src/simdutf/westmere/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/end.h
 /* begin file src/simdutf/westmere/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
 // nothing needed.
@@ -3283,7 +3937,7 @@ SIMDUTF_UNTARGET_REGION
 #endif // SIMDUTF_IMPLEMENTATION_WESTMERE
 #endif // SIMDUTF_WESTMERE_COMMON_H
 /* end file src/simdutf/westmere.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64.h
 /* begin file src/simdutf/ppc64.h */
 #ifndef SIMDUTF_PPC64_H
 #define SIMDUTF_PPC64_H
@@ -3292,13 +3946,10 @@ SIMDUTF_UNTARGET_REGION
 #error "ppc64.h must be included before fallback.h"
 #endif
 
-
 #ifndef SIMDUTF_IMPLEMENTATION_PPC64
 #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
 #endif
-#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64
-
-
+#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 (SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64) // parenthesized so the macro expands as a single operand (safe under ! and other operators)
 
 #if SIMDUTF_IMPLEMENTATION_PPC64
 
@@ -3310,12 +3961,11 @@ namespace ppc64 {
 } // namespace ppc64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/implementation.h
 /* begin file src/simdutf/ppc64/implementation.h */
 #ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
 #define SIMDUTF_PPC64_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace ppc64 {
 
@@ -3325,62 +3975,64 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation()
-      : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
-                                 internal::instruction_set::ALTIVEC) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
+            internal::instruction_set::ALTIVEC)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
 };
 
 } // namespace ppc64
@@ -3389,19 +4041,18 @@ public:
 #endif // SIMDUTF_PPC64_IMPLEMENTATION_H
 /* end file src/simdutf/ppc64/implementation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/begin.h
 /* begin file src/simdutf/ppc64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
 // #define SIMDUTF_IMPLEMENTATION ppc64
 /* end file src/simdutf/ppc64/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/intrinsics.h
 /* begin file src/simdutf/ppc64/intrinsics.h */
 #ifndef SIMDUTF_PPC64_INTRINSICS_H
 #define SIMDUTF_PPC64_INTRINSICS_H
 
-
 // This should be the correct header whether
 // you use visual studio or other compilers.
 #include <altivec.h>
@@ -3417,7 +4068,7 @@ public:
 
 #endif //  SIMDUTF_PPC64_INTRINSICS_H
 /* end file src/simdutf/ppc64/intrinsics.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h
 /* begin file src/simdutf/ppc64/bitmanipulation.h */
 #ifndef SIMDUTF_PPC64_BITMANIPULATION_H
 #define SIMDUTF_PPC64_BITMANIPULATION_H
@@ -3427,13 +4078,15 @@ namespace ppc64 {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline int count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num); // Visual Studio wants two underscores
+simdutf_really_inline int count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline int count_ones(uint64_t input_num) {
-  return __builtin_popcountll(input_num);
+simdutf_really_inline int count_ones(uint64_t input_num)
+{
+    return __builtin_popcountll(input_num);
 }
 #endif
 
@@ -3443,7 +4096,7 @@ simdutf_really_inline int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_PPC64_BITMANIPULATION_H
 /* end file src/simdutf/ppc64/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/simd.h
 /* begin file src/simdutf/ppc64/simd.h */
 #ifndef SIMDUTF_PPC64_SIMD_H
 #define SIMDUTF_PPC64_SIMD_H
@@ -3457,474 +4110,592 @@ namespace simd {
 
 using __m128i = __vector unsigned char;
 
-template <typename Child> struct base {
-  __m128i value;
-
-  // Zero constructor
-  simdutf_really_inline base() : value{__m128i()} {}
-
-  // Conversion from SIMD register
-  simdutf_really_inline base(const __m128i _value) : value(_value) {}
-
-  // Conversion to SIMD register
-  simdutf_really_inline operator const __m128i &() const {
-    return this->value;
-  }
-  simdutf_really_inline operator __m128i &() { return this->value; }
-
-  // Bit operations
-  simdutf_really_inline Child operator|(const Child other) const {
-    return vec_or(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child operator&(const Child other) const {
-    return vec_and(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child operator^(const Child other) const {
-    return vec_xor(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child bit_andnot(const Child other) const {
-    return vec_andc(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child &operator|=(const Child other) {
-    auto this_cast = static_cast<Child*>(this);
-    *this_cast = *this_cast | other;
-    return *this_cast;
-  }
-  simdutf_really_inline Child &operator&=(const Child other) {
-    auto this_cast = static_cast<Child*>(this);
-    *this_cast = *this_cast & other;
-    return *this_cast;
-  }
-  simdutf_really_inline Child &operator^=(const Child other) {
-    auto this_cast = static_cast<Child*>(this);
-    *this_cast = *this_cast ^ other;
-    return *this_cast;
-  }
+template<typename Child> struct base {
+    __m128i value;
+
+    // Zero constructor
+    simdutf_really_inline base()
+        : value { __m128i() }
+    {
+    }
+
+    // Conversion from SIMD register
+    simdutf_really_inline base(const __m128i _value)
+        : value(_value)
+    {
+    }
+
+    // Conversion to SIMD register
+    simdutf_really_inline operator const __m128i&() const
+    {
+        return this->value;
+    }
+    simdutf_really_inline operator __m128i&() { return this->value; }
+
+    // Bit operations
+    simdutf_really_inline Child operator|(const Child other) const
+    {
+        return vec_or(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child operator&(const Child other) const
+    {
+        return vec_and(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child operator^(const Child other) const
+    {
+        return vec_xor(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child bit_andnot(const Child other) const
+    {
+        return vec_andc(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child& operator|=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator&=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator^=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
 };
 
 // Forward-declared so they can be used by splat and friends.
-template <typename T> struct simd8;
+template<typename T> struct simd8;
 
-template <typename T, typename Mask = simd8<bool>>
+template<typename T, typename Mask = simd8<bool>>
 struct base8 : base<simd8<T>> {
-  typedef uint16_t bitmask_t;
-  typedef uint32_t bitmask2_t;
+    typedef uint16_t bitmask_t;
+    typedef uint32_t bitmask2_t;
 
-  simdutf_really_inline base8() : base<simd8<T>>() {}
-  simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+    simdutf_really_inline base8()
+        : base<simd8<T>>()
+    {
+    }
+    simdutf_really_inline base8(const __m128i _value)
+        : base<simd8<T>>(_value)
+    {
+    }
 
-  simdutf_really_inline Mask operator==(const simd8<T> other) const {
-    return (__m128i)vec_cmpeq(this->value, (__m128i)other);
-  }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs)
+    {
+        return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
+    }
 
-  static const int SIZE = sizeof(base<simd8<T>>::value);
+    static const int SIZE = sizeof(base<simd8<T>>::value);
 
-  template <int N = 1>
-  simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const {
-    __m128i chunk = this->value;
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const
+    {
+        __m128i chunk = this->value;
 #ifdef __LITTLE_ENDIAN__
-    chunk = (__m128i)vec_reve(this->value);
-    prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
+        chunk = (__m128i)vec_reve(this->value);
+        prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
 #endif
-    chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
+        chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
 #ifdef __LITTLE_ENDIAN__
-    chunk = (__m128i)vec_reve((__m128i)chunk);
+        chunk = (__m128i)vec_reve((__m128i)chunk);
 #endif
-    return chunk;
-  }
+        return chunk;
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
-template <> struct simd8<bool> : base8<bool> {
-  static simdutf_really_inline simd8<bool> splat(bool _value) {
-    return (__m128i)vec_splats((unsigned char)(-(!!_value)));
-  }
-
-  simdutf_really_inline simd8<bool>() : base8() {}
-  simdutf_really_inline simd8<bool>(const __m128i _value)
-      : base8<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd8<bool>(bool _value)
-      : base8<bool>(splat(_value)) {}
-
-  simdutf_really_inline int to_bitmask() const {
-    __vector unsigned long long result;
-    const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
-                               0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
-
-    result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
-                                                       (__m128i)perm_mask));
+template<> struct simd8<bool> : base8<bool> {
+    static simdutf_really_inline simd8<bool> splat(bool _value)
+    {
+        return (__m128i)vec_splats((unsigned char)(-(!!_value)));
+    }
+
+    simdutf_really_inline simd8() // default: defers to base8, whose base zero-constructs the register
+        : base8()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value) // wrap an existing mask register as-is
+        : base8<bool>(_value)
+    {
+    }
+    // Splat constructor (declared with the injected class name: C++20 rejects a template-id here)
+    simdutf_really_inline simd8(bool _value)
+        : base8<bool>(splat(_value))
+    {
+    }
+
+    simdutf_really_inline int to_bitmask() const
+    {
+        __vector unsigned long long result;
+        const __m128i perm_mask = { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+            0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 };
+
+        result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
+            (__m128i)perm_mask));
 #ifdef __LITTLE_ENDIAN__
-    return static_cast<int>(result[1]);
+        return static_cast<int>(result[1]);
 #else
-    return static_cast<int>(result[0]);
+        return static_cast<int>(result[0]);
 #endif
-  }
-  simdutf_really_inline bool any() const {
-    return !vec_all_eq(this->value, (__m128i)vec_splats(0));
-  }
-  simdutf_really_inline simd8<bool> operator~() const {
-    return this->value ^ (__m128i)splat(true);
-  }
+    }
+    simdutf_really_inline bool any() const
+    {
+        return !vec_all_eq(this->value, (__m128i)vec_splats(0));
+    }
+    simdutf_really_inline simd8<bool> operator~() const
+    {
+        return this->value ^ (__m128i)splat(true);
+    }
 };
 
-template <typename T> struct base8_numeric : base8<T> {
-  static simdutf_really_inline simd8<T> splat(T value) {
-    (void)value;
-    return (__m128i)vec_splats(value);
-  }
-  static simdutf_really_inline simd8<T> zero() { return splat(0); }
-  static simdutf_really_inline simd8<T> load(const T values[16]) {
-    return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
-  }
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
-                                                   T v5, T v6, T v7, T v8, T v9,
-                                                   T v10, T v11, T v12, T v13,
-                                                   T v14, T v15) {
-    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-                    v14, v15);
-  }
-
-  simdutf_really_inline base8_numeric() : base8<T>() {}
-  simdutf_really_inline base8_numeric(const __m128i _value)
-      : base8<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[16]) const {
-    vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
-  }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
-    return (__m128i)((__m128i)this->value + (__m128i)other);
-  }
-  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
-    return (__m128i)((__m128i)this->value - (__m128i)other);
-  }
-  simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
-    *this = *this + other;
-    return *static_cast<simd8<T> *>(this);
-  }
-  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
-    *this = *this - other;
-    return *static_cast<simd8<T> *>(this);
-  }
-
-  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
-  // for out of range values)
-  template <typename L>
-  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-    return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
-  }
-
-  template <typename L>
-  simdutf_really_inline simd8<L>
-  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
-            L replace5, L replace6, L replace7, L replace8, L replace9,
-            L replace10, L replace11, L replace12, L replace13, L replace14,
-            L replace15) const {
-    return lookup_16(simd8<L>::repeat_16(
-        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
-        replace7, replace8, replace9, replace10, replace11, replace12,
-        replace13, replace14, replace15));
-  }
+template<typename T> struct base8_numeric : base8<T> {
+    static simdutf_really_inline simd8<T> splat(T value)
+    {
+        (void)value;
+        return (__m128i)vec_splats(value);
+    }
+    static simdutf_really_inline simd8<T> zero() { return splat(0); }
+    static simdutf_really_inline simd8<T> load(const T values[16])
+    {
+        return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t*>(values)));
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+        T v5, T v6, T v7, T v8, T v9,
+        T v10, T v11, T v12, T v13,
+        T v14, T v15)
+    {
+        return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+            v14, v15);
+    }
+
+    simdutf_really_inline base8_numeric()
+        : base8<T>()
+    {
+    }
+    simdutf_really_inline base8_numeric(const __m128i _value)
+        : base8<T>(_value)
+    {
+    }
+
+    // Store to array
+    simdutf_really_inline void store(T dst[16]) const
+    {
+        vec_vsx_st(this->value, 0, reinterpret_cast<__m128i*>(dst));
+    }
+
+    // Override to distinguish from bool version
+    simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd8<T> operator+(const simd8<T> other) const
+    {
+        return (__m128i)((__m128i)this->value + (__m128i)other);
+    }
+    simdutf_really_inline simd8<T> operator-(const simd8<T> other) const
+    {
+        return (__m128i)((__m128i)this->value - (__m128i)other);
+    }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd8<T>*>(this);
+    }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd8<T>*>(this);
+    }
+
+    // Perform a lookup assuming the value is between 0 and 15 (undefined behavior
+    // for out of range values)
+    template<typename L>
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
+    }
+
+    template<typename L>
+    simdutf_really_inline simd8<L>
+    lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+        L replace5, L replace6, L replace7, L replace8, L replace9,
+        L replace10, L replace11, L replace12, L replace13, L replace14,
+        L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+            replace7, replace8, replace9, replace10, replace11, replace12,
+            replace13, replace14, replace15));
+    }
 };
 
 // Signed bytes
-template <> struct simd8<int8_t> : base8_numeric<int8_t> {
-  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
-  simdutf_really_inline simd8(const __m128i _value)
-      : base8_numeric<int8_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
-  // Member-by-member initialization
-  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
-                               int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-                               int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-                               int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-      : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
-                                              v8, v9, v10, v11, v12, v13, v14,
-                                              v15}) {}
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  simdutf_really_inline static simd8<int8_t>
-  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
-            int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-            int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
-    return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-                         v13, v14, v15);
-  }
-
-  // Order-sensitive comparisons
-  simdutf_really_inline simd8<int8_t>
-  max_val(const simd8<int8_t> other) const {
-    return (__m128i)vec_max((__vector signed char)this->value,
-                            (__vector signed char)(__m128i)other);
-  }
-  simdutf_really_inline simd8<int8_t>
-  min_val(const simd8<int8_t> other) const {
-    return (__m128i)vec_min((__vector signed char)this->value,
-                            (__vector signed char)(__m128i)other);
-  }
-  simdutf_really_inline simd8<bool>
-  operator>(const simd8<int8_t> other) const {
-    return (__m128i)vec_cmpgt((__vector signed char)this->value,
-                              (__vector signed char)(__m128i)other);
-  }
-  simdutf_really_inline simd8<bool>
-  operator<(const simd8<int8_t> other) const {
-    return (__m128i)vec_cmplt((__vector signed char)this->value,
-                              (__vector signed char)(__m128i)other);
-  }
+template<> struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<int8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<int8_t>(_value)
+    {
+    }
+
+    // Splat constructor
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd8(const int8_t* values)
+        : simd8(load(values))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+        int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+        int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8((__m128i)(__vector signed char) { v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14,
+            v15 })
+    {
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    simdutf_really_inline static simd8<int8_t>
+    repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+        int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+        int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+            v13, v14, v15);
+    }
+
+    // Order-sensitive comparisons
+    simdutf_really_inline simd8<int8_t>
+    max_val(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_max((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
+    simdutf_really_inline simd8<int8_t>
+    min_val(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_min((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
+    simdutf_really_inline simd8<bool>
+    operator>(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_cmpgt((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
+    simdutf_really_inline simd8<bool>
+    operator<(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_cmplt((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
 };
 
 // Unsigned bytes
-template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
-  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
-  simdutf_really_inline simd8(const __m128i _value)
-      : base8_numeric<uint8_t>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
-  // Member-by-member initialization
-  simdutf_really_inline
-  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+template<> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<uint8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<uint8_t>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd8(const uint8_t* values)
+        : simd8(load(values))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline
+    simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
         uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
         uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-      : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-                        v13, v14, v15}) {}
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  simdutf_really_inline static simd8<uint8_t>
-  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
-            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
-            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
-            uint8_t v15) {
-    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-                          v13, v14, v15);
-  }
-
-  // Saturated math
-  simdutf_really_inline simd8<uint8_t>
-  saturating_add(const simd8<uint8_t> other) const {
-    return (__m128i)vec_adds(this->value, (__m128i)other);
-  }
-  simdutf_really_inline simd8<uint8_t>
-  saturating_sub(const simd8<uint8_t> other) const {
-    return (__m128i)vec_subs(this->value, (__m128i)other);
-  }
-
-  // Order-specific operations
-  simdutf_really_inline simd8<uint8_t>
-  max_val(const simd8<uint8_t> other) const {
-    return (__m128i)vec_max(this->value, (__m128i)other);
-  }
-  simdutf_really_inline simd8<uint8_t>
-  min_val(const simd8<uint8_t> other) const {
-    return (__m128i)vec_min(this->value, (__m128i)other);
-  }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd8<uint8_t>
-  gt_bits(const simd8<uint8_t> other) const {
-    return this->saturating_sub(other);
-  }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd8<uint8_t>
-  lt_bits(const simd8<uint8_t> other) const {
-    return other.saturating_sub(*this);
-  }
-  simdutf_really_inline simd8<bool>
-  operator<=(const simd8<uint8_t> other) const {
-    return other.max_val(*this) == other;
-  }
-  simdutf_really_inline simd8<bool>
-  operator>=(const simd8<uint8_t> other) const {
-    return other.min_val(*this) == other;
-  }
-  simdutf_really_inline simd8<bool>
-  operator>(const simd8<uint8_t> other) const {
-    return this->gt_bits(other).any_bits_set();
-  }
-  simdutf_really_inline simd8<bool>
-  operator<(const simd8<uint8_t> other) const {
-    return this->gt_bits(other).any_bits_set();
-  }
-
-  // Bit-specific operations
-  simdutf_really_inline simd8<bool> bits_not_set() const {
-    return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
-  }
-  simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
-    return (*this & bits).bits_not_set();
-  }
-  simdutf_really_inline simd8<bool> any_bits_set() const {
-    return ~this->bits_not_set();
-  }
-  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
-    return ~this->bits_not_set(bits);
-  }
-
-  simdutf_really_inline bool is_ascii() const {
-      return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
-  }
-
-  simdutf_really_inline bool bits_not_set_anywhere() const {
-    return vec_all_eq(this->value, (__m128i)vec_splats(0));
-  }
-  simdutf_really_inline bool any_bits_set_anywhere() const {
-    return !bits_not_set_anywhere();
-  }
-  simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
-    return vec_all_eq(vec_and(this->value, (__m128i)bits),
-                      (__m128i)vec_splats(0));
-  }
-  simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
-    return !bits_not_set_anywhere(bits);
-  }
-  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
-    return simd8<uint8_t>(
-        (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
-  }
-  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
-    return simd8<uint8_t>(
-        (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
-  }
+        : simd8((__m128i) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+            v13, v14, v15 })
+    {
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    simdutf_really_inline static simd8<uint8_t>
+    repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+        uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+        uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+        uint8_t v15)
+    {
+        return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+            v13, v14, v15);
+    }
+
+    // Saturated math
+    simdutf_really_inline simd8<uint8_t>
+    saturating_add(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_adds(this->value, (__m128i)other);
+    }
+    simdutf_really_inline simd8<uint8_t>
+    saturating_sub(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_subs(this->value, (__m128i)other);
+    }
+
+    // Order-specific operations
+    simdutf_really_inline simd8<uint8_t>
+    max_val(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_max(this->value, (__m128i)other);
+    }
+    simdutf_really_inline simd8<uint8_t>
+    min_val(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_min(this->value, (__m128i)other);
+    }
+    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+    simdutf_really_inline simd8<uint8_t>
+    gt_bits(const simd8<uint8_t> other) const
+    {
+        return this->saturating_sub(other);
+    }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd8<uint8_t>
+    lt_bits(const simd8<uint8_t> other) const
+    {
+        return other.saturating_sub(*this);
+    }
+    simdutf_really_inline simd8<bool>
+    operator<=(const simd8<uint8_t> other) const
+    {
+        return other.max_val(*this) == other;
+    }
+    simdutf_really_inline simd8<bool>
+    operator>=(const simd8<uint8_t> other) const
+    {
+        return other.min_val(*this) == other;
+    }
+    simdutf_really_inline simd8<bool>
+    operator>(const simd8<uint8_t> other) const
+    {
+        return this->gt_bits(other).any_bits_set();
+    }
+    simdutf_really_inline simd8<bool>
+    operator<(const simd8<uint8_t> other) const
+    {
+        return this->lt_bits(other).any_bits_set(); // lt_bits = saturating (other - this): nonzero exactly in lanes where this < other
+    }
+
+    // Bit-specific operations
+    simdutf_really_inline simd8<bool> bits_not_set() const
+    {
+        return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
+    }
+    simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const
+    {
+        return (*this & bits).bits_not_set();
+    }
+    simdutf_really_inline simd8<bool> any_bits_set() const
+    {
+        return ~this->bits_not_set();
+    }
+    simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const
+    {
+        return ~this->bits_not_set(bits);
+    }
+
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
+    }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const
+    {
+        return vec_all_eq(this->value, (__m128i)vec_splats(0));
+    }
+    simdutf_really_inline bool any_bits_set_anywhere() const
+    {
+        return !bits_not_set_anywhere();
+    }
+    simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const
+    {
+        return vec_all_eq(vec_and(this->value, (__m128i)bits),
+            (__m128i)vec_splats(0));
+    }
+    simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const
+    {
+        return !bits_not_set_anywhere(bits);
+    }
+    template<int N> simdutf_really_inline simd8<uint8_t> shr() const
+    {
+        return simd8<uint8_t>(
+            (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
+    }
+    template<int N> simdutf_really_inline simd8<uint8_t> shl() const
+    {
+        return simd8<uint8_t>(
+            (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
+    }
 };
 
-template <typename T> struct simd8x64 {
-  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
-  static_assert(NUM_CHUNKS == 4,
-                "PPC64 kernel should use four registers per 64-byte block.");
-  simd8<T> chunks[NUM_CHUNKS];
+template<typename T> struct simd8x64 {
+    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+    static_assert(NUM_CHUNKS == 4,
+        "PPC64 kernel should use four registers per 64-byte block.");
+    simd8<T> chunks[NUM_CHUNKS];
 
-  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
-  simd8x64<T> &
-  operator=(const simd8<T> other) = delete; // no assignment allowed
-  simd8x64() = delete;                      // no default constructor allowed
+    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
+    simd8x64<T>&
+    operator=(const simd8<T> other)
+        = delete; // no assignment allowed
+    simd8x64() = delete; // no default constructor allowed
 
-  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
-                                  const simd8<T> chunk2, const simd8<T> chunk3)
-      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+        const simd8<T> chunk2, const simd8<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
 
-  simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
+    {
+    }
 
-  simdutf_really_inline void store(T* ptr) const {
-    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
-    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
-    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
-    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
-  }
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    }
 
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        this->chunks[2] |= other.chunks[2];
+        this->chunks[3] |= other.chunks[3];
+        return *this;
+    }
 
-  simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      this->chunks[2] |= other.chunks[2];
-      this->chunks[3] |= other.chunks[3];
-      return *this;
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-  simdutf_really_inline simd8<T> reduce_or() const {
-    return (this->chunks[0] | this->chunks[1]) |
-           (this->chunks[2] | this->chunks[3]);
-  }
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii(); // OR the four chunks first: a high bit set in any byte survives into the combined register
+    }
 
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r1 = this->chunks[1].to_bitmask();
+        uint64_t r2 = this->chunks[2].to_bitmask();
+        uint64_t r3 = this->chunks[3].to_bitmask();
+        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
 
-  simdutf_really_inline bool is_ascii() const {
-    return input.reduce_or().is_ascii();
-  }
-
-  simdutf_really_inline uint64_t to_bitmask() const {
-    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
-    uint64_t r1 = this->chunks[1].to_bitmask();
-    uint64_t r2 = this->chunks[2].to_bitmask();
-    uint64_t r3 = this->chunks[3].to_bitmask();
-    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-  }
-
-  simdutf_really_inline uint64_t eq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
-                          this->chunks[2] == mask, this->chunks[3] == mask)
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
-    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
-                          this->chunks[1] == other.chunks[1],
-                          this->chunks[2] == other.chunks[2],
-                          this->chunks[3] == other.chunks[3])
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t lteq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
-                          this->chunks[2] <= mask, this->chunks[3] <= mask)
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-      return  simd8x64<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t lt(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
-                          this->chunks[2] < mask, this->chunks[3] < mask)
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask,
-        this->chunks[2] > mask,
-        this->chunks[3] > mask
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask,
-        this->chunks[2] >= mask,
-        this->chunks[3] >= mask
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        simd8<uint8_t>(this->chunks[0]) >= mask,
-        simd8<uint8_t>(this->chunks[1]) >= mask,
-        simd8<uint8_t>(this->chunks[2]) >= mask,
-        simd8<uint8_t>(this->chunks[3]) >= mask
-      ).to_bitmask();
-  }
+    simdutf_really_inline uint64_t eq(const T m) const // bitmask of the == comparison of every chunk against splat(m)
+    {
+        const simd8<T> needle = simd8<T>::splat(m);
+        const simd8x64<bool> hits(this->chunks[0] == needle, this->chunks[1] == needle,
+            this->chunks[2] == needle, this->chunks[3] == needle);
+        return hits.to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const // bitmask of chunk-by-chunk == against other
+    {
+        const simd8x64<bool> hits(this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1],
+            this->chunks[2] == other.chunks[2],
+            this->chunks[3] == other.chunks[3]);
+        return hits.to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const // bitmask of the <= comparison of every chunk against splat(m)
+    {
+        const simd8<T> bound = simd8<T>::splat(m);
+        const simd8x64<bool> hits(this->chunks[0] <= bound, this->chunks[1] <= bound,
+            this->chunks[2] <= bound, this->chunks[3] <= bound);
+        return hits.to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> lower = simd8<T>::splat(low);
+        const simd8<T> upper = simd8<T>::splat(high);
+        // A position is selected when (chunk <= upper) & (chunk >= lower), i.e. both ends are inclusive.
+        const simd8x64<bool> inside(
+            (this->chunks[0] <= upper) & (this->chunks[0] >= lower),
+            (this->chunks[1] <= upper) & (this->chunks[1] >= lower),
+            (this->chunks[2] <= upper) & (this->chunks[2] >= lower),
+            (this->chunks[3] <= upper) & (this->chunks[3] >= lower));
+        return inside.to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const // selects (chunk > high) | (chunk < low)
+    {
+        const simd8<T> lower = simd8<T>::splat(low);
+        const simd8<T> upper = simd8<T>::splat(high);
+        const simd8x64<bool> outside(
+            (this->chunks[0] > upper) | (this->chunks[0] < lower),
+            (this->chunks[1] > upper) | (this->chunks[1] < lower),
+            (this->chunks[2] > upper) | (this->chunks[2] < lower),
+            (this->chunks[3] > upper) | (this->chunks[3] < lower));
+        return outside.to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const // bitmask of the < comparison of every chunk against splat(m)
+    {
+        const simd8<T> bound = simd8<T>::splat(m);
+        const simd8x64<bool> hits(this->chunks[0] < bound, this->chunks[1] < bound,
+            this->chunks[2] < bound, this->chunks[3] < bound);
+        return hits.to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t gt(const T m) const // bitmask of the > comparison of every chunk against splat(m)
+    {
+        const simd8<T> bound = simd8<T>::splat(m);
+        const simd8x64<bool> hits(
+            this->chunks[0] > bound,
+            this->chunks[1] > bound,
+            this->chunks[2] > bound,
+            this->chunks[3] > bound);
+        return hits.to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const // bitmask of the >= comparison of every chunk against splat(m)
+    {
+        const simd8<T> bound = simd8<T>::splat(m);
+        const simd8x64<bool> hits(
+            this->chunks[0] >= bound,
+            this->chunks[1] >= bound,
+            this->chunks[2] >= bound,
+            this->chunks[3] >= bound);
+        return hits.to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const // >= test after converting each chunk to simd8<uint8_t>
+    {
+        const simd8<uint8_t> threshold = simd8<uint8_t>::splat(m);
+        const simd8x64<bool> hits(
+            simd8<uint8_t>(this->chunks[0]) >= threshold,
+            simd8<uint8_t>(this->chunks[1]) >= threshold,
+            simd8<uint8_t>(this->chunks[2]) >= threshold,
+            simd8<uint8_t>(this->chunks[3]) >= threshold);
+        return hits.to_bitmask();
+    }
 }; // struct simd8x64<T>
 
 } // namespace simd
@@ -3935,7 +4706,7 @@ template <typename T> struct simd8x64 {
 #endif // SIMDUTF_PPC64_SIMD_INPUT_H
 /* end file src/simdutf/ppc64/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/end.h
 /* begin file src/simdutf/ppc64/end.h */
 /* end file src/simdutf/ppc64/end.h */
 
@@ -3943,12 +4714,11 @@ template <typename T> struct simd8x64 {
 
 #endif // SIMDUTF_PPC64_H
 /* end file src/simdutf/ppc64.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback.h
 /* begin file src/simdutf/fallback.h */
 #ifndef SIMDUTF_FALLBACK_H
 #define SIMDUTF_FALLBACK_H
 
-
 // Note that fallback.h is always imported last.
 
 // Default Fallback to on unless a builtin implementation has already been selected.
@@ -3972,12 +4742,11 @@ namespace fallback {
 } // namespace fallback
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/implementation.h
 /* begin file src/simdutf/fallback/implementation.h */
 #ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
 #define SIMDUTF_FALLBACK_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace fallback {
 
@@ -3987,64 +4756,88 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation(
-      "fallback",
-      "Generic fallback implementation",
-      0
-  ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation() // default constructor: forwards three fixed values to the simdutf::implementation base
+        : simdutf::implementation(
+            "fallback", // presumably the implementation name — confirm against the simdutf::implementation constructor
+            "Generic fallback implementation", // presumably the human-readable description — confirm
+            0) // presumably the required-instruction-set mask (nothing required) — confirm
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace fallback
@@ -4053,14 +4846,14 @@ public:
 #endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
 /* end file src/simdutf/fallback/implementation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/begin.h
 /* begin file src/simdutf/fallback/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
 // #define SIMDUTF_IMPLEMENTATION fallback
 /* end file src/simdutf/fallback/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
 /* begin file src/simdutf/fallback/bitmanipulation.h */
 #ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
 #define SIMDUTF_FALLBACK_BITMANIPULATION_H
@@ -4072,19 +4865,21 @@ namespace fallback {
 namespace {
 
 #if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
-static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
-  unsigned long x0 = (unsigned long)x, top, bottom;
-  _BitScanForward(&top, (unsigned long)(x >> 32));
-  _BitScanForward(&bottom, x0);
-  *ret = x0 ? bottom : 32 + top;
-  return x != 0;
-}
-static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
-  unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
-  _BitScanReverse(&top, x1);
-  _BitScanReverse(&bottom, (unsigned long)x);
-  *ret = x1 ? top + 32 : bottom;
-  return x != 0;
+static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) // 64-bit forward scan assembled from two 32-bit _BitScanForward calls
+{
+    unsigned long low_word = (unsigned long)x, high_index, low_index;
+    _BitScanForward(&high_index, (unsigned long)(x >> 32));
+    _BitScanForward(&low_index, low_word);
+    *ret = low_word ? low_index : 32 + high_index; // use the low half's index when that half is nonzero, else the high half's index plus 32
+    return x != 0; // 1 when x has any bit set, 0 otherwise
+}
+static inline unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) // 64-bit reverse scan assembled from two 32-bit _BitScanReverse calls; `inline` added to match _BitScanForward64
+{
+    unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
+    _BitScanReverse(&top, x1);
+    _BitScanReverse(&bottom, (unsigned long)x);
+    *ret = x1 ? top + 32 : bottom; // use the high half's index plus 32 when that half is nonzero, else the low half's index
+    return x != 0; // 1 when x has any bit set, 0 otherwise
 }
 #endif
 
@@ -4095,7 +4890,7 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
 #endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
 /* end file src/simdutf/fallback/bitmanipulation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/end.h
 /* begin file src/simdutf/fallback/end.h */
 /* end file src/simdutf/fallback/end.h */
 
@@ -4104,16 +4899,20 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
 /* end file src/simdutf/fallback.h */
 
 namespace simdutf {
-bool implementation::supported_by_runtime_system() const {
-  uint32_t required_instruction_sets = this->required_instruction_sets();
-  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
-  return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
+bool implementation::supported_by_runtime_system() const // true when every required instruction-set bit is also in the detected set
+{
+    const uint32_t needed = this->required_instruction_sets();
+    const uint32_t available = internal::detect_supported_architectures();
+    return (needed & ~available) == 0; // no required bit may be missing from the detected set
 }
 
-simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
+simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char* input, size_t length) const noexcept
+{
     // If there is a BOM, then we trust it.
     auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
     // UTF8 is common, it includes ASCII, and is commonly represented
     // without a BOM, so if it fits, go with that. Note that it is still
     // possible to get it wrong, we are only 'guessing'. If some has UTF-16
@@ -4121,15 +4920,21 @@ simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char
     //
     // An interesting twist might be to check for UTF-16 ASCII first (every
     // other byte is zero).
-    if(validate_utf8(input, length)) { return encoding_type::UTF8; }
+    if (validate_utf8(input, length)) {
+        return encoding_type::UTF8;
+    }
     // The next most common encoding that might appear without BOM is probably
     // UTF-16LE, so try that next.
-    if((length % 2) == 0) {
-      // important: we need to divide by two
-      if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { return encoding_type::UTF16_LE; }
+    if ((length % 2) == 0) {
+        // important: we need to divide by two
+        if (validate_utf16le(reinterpret_cast<const char16_t*>(input), length / 2)) {
+            return encoding_type::UTF16_LE;
+        }
     }
-    if((length % 4) == 0) {
-      if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { return encoding_type::UTF32_LE; }
+    if ((length % 4) == 0) {
+        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
+            return encoding_type::UTF32_LE;
+        }
     }
     return encoding_type::unspecified;
 }
@@ -4139,24 +4944,47 @@ namespace internal {
 // Static array of known implementations. We're hoping these get baked into the executable
 // without requiring a static initializer.
 
-
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-const icelake::implementation icelake_singleton{};
+static const icelake::implementation* get_icelake_singleton() // returns the address of the single function-local static instance
+{
+    static const icelake::implementation instance {};
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-const haswell::implementation haswell_singleton{};
+static const haswell::implementation* get_haswell_singleton() // returns the address of the single function-local static instance
+{
+    static const haswell::implementation instance {};
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-const westmere::implementation westmere_singleton{};
+static const westmere::implementation* get_westmere_singleton() // returns the address of the single function-local static instance
+{
+    static const westmere::implementation instance {};
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_ARM64
-const arm64::implementation arm64_singleton{};
+static const arm64::implementation* get_arm64_singleton() // returns the address of the single function-local static instance
+{
+    static const arm64::implementation instance {};
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-const ppc64::implementation ppc64_singleton{};
+static const ppc64::implementation* get_ppc64_singleton() // returns the address of the single function-local static instance
+{
+    static const ppc64::implementation instance {};
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-const fallback::implementation fallback_singleton{};
+static const fallback::implementation* get_fallback_singleton() // returns the address of the single function-local static instance
+{
+    static const fallback::implementation instance {};
+    return &instance;
+}
 #endif
 
 /**
@@ -4164,831 +4992,1275 @@ const fallback::implementation fallback_singleton{};
  */
 class detect_best_supported_implementation_on_first_use final : public implementation {
 public:
-  const std::string &name() const noexcept final { return set_best()->name(); }
-  const std::string &description() const noexcept final { return set_best()->description(); }
-  uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
+    const std::string& name() const noexcept final { return set_best()->name(); }
+    const std::string& description() const noexcept final { return set_best()->description(); }
+    uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
+
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept override
+    {
+        return set_best()->detect_encodings(input, length);
+    }
+
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf8(buf, len);
+    }
+
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf8_with_errors(buf, len);
+    }
+
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_ascii(buf, len);
+    }
+
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_ascii_with_errors(buf, len);
+    }
+
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16le(buf, len);
+    }
+
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16be(buf, len);
+    }
+
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16le_with_errors(buf, len);
+    }
 
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
-    return set_best()->detect_encodings(input, length);
-  }
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16be_with_errors(buf, len);
+    }
 
-  simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf8(buf, len);
-  }
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf32(buf, len);
+    }
 
-  simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf8_with_errors(buf, len);
-  }
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf32_with_errors(buf, len);
+    }
 
-  simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_ascii(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_ascii_with_errors(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16le(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    { // Forward the Latin-1 -> UTF-32 conversion to the implementation returned by set_best().
+        return set_best()->convert_latin1_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16le_with_errors(buf, len);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16be_with_errors(buf, len);
-  }
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf32(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf32_with_errors(buf, len);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
+    { // Forward to the same-named convert_valid_* method of the selected implementation, as the sibling forwarders do.
+        return set_best()->convert_valid_utf32_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
+    }
 
-  void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override {
-    set_best()->change_endianness_utf16(buf, len, output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->count_utf16le(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->count_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
+    }
+
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
+    }
+
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override {
-    return set_best()->count_utf8(buf, len);
-  }
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf16le(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf16le(buf, len);
-  }
+    void change_endianness_utf16(const char16_t* buf, size_t len, char16_t* output) const noexcept final override
+    {
+        set_best()->change_endianness_utf16(buf, len, output);
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->count_utf16le(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override {
-    return set_best()->utf16_length_from_utf8(buf, len);
-  }
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->count_utf16be(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf32(buf, len);
-  }
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->count_utf8(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
-    return set_best()->utf16_length_from_utf32(buf, len);
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->latin1_length_from_utf8(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf8(buf, len);
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override
+    {
+        return set_best()->latin1_length_from_utf16(len);
+    }
 
-  simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override
+    {
+        return set_best()->latin1_length_from_utf32(len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_latin1(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_utf16le(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_utf16be(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept override
+    {
+        return set_best()->utf16_length_from_latin1(len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_latin1(len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_utf16le(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_utf16be(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf16_length_from_utf8(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_utf32(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf16_length_from_utf32(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_utf8(buf, len);
+    }
+
+    simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept
+        : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0)
+    {
+    }
 
 private:
-  const implementation *set_best() const noexcept;
+    const implementation* set_best() const noexcept;
 };
 
-
-const std::initializer_list<const implementation *> available_implementation_pointers {
+static const std::initializer_list<const implementation*>& get_available_implementation_pointers()
+{
+    static const std::initializer_list<const implementation*> available_implementation_pointers
+    {
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-  &icelake_singleton,
+        get_icelake_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-  &haswell_singleton,
+            get_haswell_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-  &westmere_singleton,
+            get_westmere_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_ARM64
-  &arm64_singleton,
+            get_arm64_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-  &ppc64_singleton,
+            get_ppc64_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-  &fallback_singleton,
+            get_fallback_singleton(),
 #endif
-}; // available_implementation_pointers
+    }; // available_implementation_pointers
+    return available_implementation_pointers;
+}
 
 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
 class unsupported_implementation final : public implementation {
 public:
-  simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override {
-    return encoding_type::unspecified;
-  }
-
-  simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
-    return false; // Just refuse to validate. Given that we have a fallback implementation
-    // it seems unlikely that unsupported_implementation will ever be used. If it is used,
-    // then it will flag all strings as invalid. The alternative is to return an error_code
-    // from which the user has to figure out whether the string is valid UTF-8... which seems
-    // like a lot of work just to handle the very unlikely case that we have an unsupported
-    // implementation. And, when it does happen (that we have an unsupported implementation),
-    // what are the chances that the programmer has a fallback? Given that *we* provide the
-    // fallback, it implies that the programmer would need a fallback for our fallback.
-  }
-
-  simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
-
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused int detect_encodings(const char*, size_t) const noexcept override
+    {
+        return encoding_type::unspecified;
+    }
+
+    simdutf_warn_unused bool validate_utf8(const char*, size_t) const noexcept final override
+    {
+        return false; // Just refuse to validate. Given that we have a fallback implementation
+        // it seems unlikely that unsupported_implementation will ever be used. If it is used,
+        // then it will flag all strings as invalid. The alternative is to return an error_code
+        // from which the user has to figure out whether the string is valid UTF-8... which seems
+        // like a lot of work just to handle the very unlikely case that we have an unsupported
+        // implementation. And, when it does happen (that we have an unsupported implementation),
+        // what are the chances that the programmer has a fallback? Given that *we* provide the
+        // fallback, it implies that the programmer would need a fallback for our fallback.
+    }
+
+    simdutf_warn_unused result validate_utf8_with_errors(const char*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused bool validate_ascii(const char*, size_t) const noexcept final override
+    {
+        return false;
+    }
+
+    simdutf_warn_unused result validate_ascii_with_errors(const char*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override
+    {
+        return false;
+    }
+
+    simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override
+    {
+        return false;
+    }
+
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override
+    {
+        return false;
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    void change_endianness_utf16(const char16_t*, size_t, char16_t*) const noexcept final override
+    {
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t count_utf16le(const char16_t*, size_t) const noexcept final override
+    {
+        return 0;
+    }
 
-  void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override {
+    simdutf_warn_unused size_t count_utf16be(const char16_t*, size_t) const noexcept final override
+    {
+        return 0;
+    }
 
-  }
+    simdutf_warn_unused size_t count_utf8(const char*, size_t) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override
+    {
+        return 0;
+    }
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override
+    {
+        return 0;
+    }
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
+    unsupported_implementation()
+        : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0)
+    {
+    }
 };
 
-const unsupported_implementation unsupported_singleton{};
+const unsupported_implementation unsupported_singleton {};
 
-size_t available_implementation_list::size() const noexcept {
-  return internal::available_implementation_pointers.size();
+size_t available_implementation_list::size() const noexcept
+{
+    return internal::get_available_implementation_pointers().size();
 }
-const implementation * const *available_implementation_list::begin() const noexcept {
-  return internal::available_implementation_pointers.begin();
+const implementation* const* available_implementation_list::begin() const noexcept
+{
+    return internal::get_available_implementation_pointers().begin();
 }
-const implementation * const *available_implementation_list::end() const noexcept {
-  return internal::available_implementation_pointers.end();
+const implementation* const* available_implementation_list::end() const noexcept
+{
+    return internal::get_available_implementation_pointers().end();
 }
-const implementation *available_implementation_list::detect_best_supported() const noexcept {
-  // They are prelisted in priority order, so we just go down the list
-  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
-  for (const implementation *impl : internal::available_implementation_pointers) {
-    uint32_t required_instruction_sets = impl->required_instruction_sets();
-    if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
-  }
-  return &unsupported_singleton; // this should never happen?
+const implementation* available_implementation_list::detect_best_supported() const noexcept
+{
+    // They are prelisted in priority order, so we just go down the list
+    uint32_t supported_instruction_sets = internal::detect_supported_architectures();
+    for (const implementation* impl : internal::get_available_implementation_pointers()) {
+        uint32_t required_instruction_sets = impl->required_instruction_sets();
+        if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) {
+            return impl;
+        }
+    }
+    return &unsupported_singleton; // this should never happen?
 }
 
-const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
-  SIMDUTF_PUSH_DISABLE_WARNINGS
-  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
-  char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
-  SIMDUTF_POP_DISABLE_WARNINGS
+const implementation* detect_best_supported_implementation_on_first_use::set_best() const noexcept
+{
+    SIMDUTF_PUSH_DISABLE_WARNINGS
+    SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
+        char* force_implementation_name
+        = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
+    SIMDUTF_POP_DISABLE_WARNINGS
 
-  if (force_implementation_name) {
-    auto force_implementation = get_available_implementations()[force_implementation_name];
-    if (force_implementation) {
-      return get_active_implementation() = force_implementation;
-    } else {
-      // Note: abort() and stderr usage within the library is forbidden.
-      return get_active_implementation() = &unsupported_singleton;
+    if (force_implementation_name) {
+        auto force_implementation = get_available_implementations()[force_implementation_name];
+        if (force_implementation) {
+            return get_active_implementation() = force_implementation;
+        } else {
+            // Note: abort() and stderr usage within the library is forbidden.
+            return get_active_implementation() = &unsupported_singleton;
+        }
     }
-  }
-  return get_active_implementation() = get_available_implementations().detect_best_supported();
+    return get_active_implementation() = get_available_implementations().detect_best_supported();
 }
 
 } // namespace internal
 
-
-
 /**
  * The list of available implementations compiled into simdutf.
  */
-SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
-  static const internal::available_implementation_list available_implementations{};
-  return available_implementations;
+SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations()
+{
+    static const internal::available_implementation_list available_implementations {};
+    return available_implementations;
 }
 
 /**
-  * The active implementation.
-  */
-SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
+ * The active implementation.
+ */
+SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation()
+{
     static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
-    static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
+    static internal::atomic_ptr<const implementation> active_implementation { &detect_best_supported_implementation_on_first_use_singleton };
     return active_implementation;
 }
 
-simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf8(buf, len);
+simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf8(buf, len);
 }
-simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf8_with_errors(buf, len);
 }
-simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_ascii(buf, len);
+simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_ascii(buf, len);
 }
-simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_ascii_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf8_to_utf16be(input, length, utf16_output);
-  #else
-  return convert_utf8_to_utf16le(input, length, utf16_output);
-  #endif
+simdutf_warn_unused size_t convert_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf8_to_utf16be(input, length, utf16_output);
+#else
+    return convert_utf8_to_utf16le(input, length, utf16_output);
+#endif
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
+simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+    return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
+simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+    return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
-  #else
-  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
-  #endif
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+#else
+    return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+#endif
 }
-simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+    return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+    return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
+simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+    return get_active_implementation()->convert_latin1_to_utf16le(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
+simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept
+{
+    return get_active_implementation()->convert_latin1_to_utf16be(input, length, utf16_output);
 }
-simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return validate_utf16be(buf, len);
-  #else
-  return validate_utf16le(buf, len);
-  #endif
+simdutf_warn_unused size_t convert_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_output) noexcept
+{
+    return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
 }
-simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16le(buf, len);
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* input, size_t length, char32_t* utf32_output) noexcept
+{
+    return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
 }
-simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16be(buf, len);
+simdutf_warn_unused bool validate_utf16(const char16_t* buf, size_t len) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return validate_utf16be(buf, len);
+#else
+    return validate_utf16le(buf, len);
+#endif
 }
-simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return validate_utf16be_with_errors(buf, len);
-  #else
-  return validate_utf16le_with_errors(buf, len);
-  #endif
+simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf16le(buf, len);
 }
-simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16le_with_errors(buf, len);
+simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf16be(buf, len);
 }
-simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16be_with_errors(buf, len);
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t* buf, size_t len) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return validate_utf16be_with_errors(buf, len);
+#else
+    return validate_utf16le_with_errors(buf, len);
+#endif
 }
-simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf32(buf, len);
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf16le_with_errors(buf, len);
 }
-simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf32_with_errors(buf, len);
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf16be_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
-  #else
-  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
-  #endif
+simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf32(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) noexcept
+{
+    return get_active_implementation()->validate_utf32_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+#else
+    return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+#endif
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
-  #else
-  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
-  #endif
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf8(buf, len, utf8_buffer);
+#else
+    return convert_utf16le_to_utf8(buf, len, utf8_buffer);
+#endif
 }
-simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
-  #else
-  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
-  #endif
-}
-simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
-}
-simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  #if BIG_ENDIAN
-  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
-  #else
-  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
-  #endif
-}
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
-  #else
-  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
-  #endif
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
-}
-simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
-  #else
-  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
-  #endif
-}
-simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+#else
+    return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
+#endif
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
-  #else
-  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
-  #endif
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+#else
+    return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
+#endif
 }
-simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
-  #else
-  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
-  #endif
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
-  #else
-  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
-  #endif
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf32_to_utf16be(buf, len, utf16_buffer);
+#else
+    return convert_utf32_to_utf16le(buf, len, utf16_buffer);
+#endif
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
-  #else
-  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
-  #endif
+simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_output) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
 }
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
 }
-void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
-  get_active_implementation()->change_endianness_utf16(input, length, output);
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+#else
+    return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+#endif
 }
-simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return count_utf16be(input, length);
-  #else
-  return count_utf16le(input, length);
-  #endif
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf16le(input, length);
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf16be(input, length);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+#else
+    return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf8(input, length);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return utf8_length_from_utf16be(input, length);
-  #else
-  return utf8_length_from_utf16le(input, length);
-  #endif
+simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf32(buf, len, utf32_buffer);
+#else
+    return convert_utf16le_to_utf32(buf, len, utf32_buffer);
+#endif
 }
-simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf16le(input, length);
+simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
+}
+simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
+}
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+#else
+    return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+#endif
 }
-simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf16be(input, length);
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return utf32_length_from_utf16be(input, length);
-  #else
-  return utf32_length_from_utf16le(input, length);
-  #endif
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf16le(input, length);
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+#else
+    return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+#endif
 }
-simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf16be(input, length);
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf32(input, length);
+void change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) noexcept
+{
+    get_active_implementation()->change_endianness_utf16(input, length, output);
 }
-simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t count_utf16(const char16_t* input, size_t length) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return count_utf16be(input, length);
+#else
+    return count_utf16le(input, length);
+#endif
 }
-simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf8(input, length);
+simdutf_warn_unused size_t count_utf16le(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->count_utf16le(input, length);
 }
-simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
-  return get_active_implementation()->autodetect_encoding(buf, length);
+simdutf_warn_unused size_t count_utf16be(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->count_utf16be(input, length);
 }
-simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
-  return get_active_implementation()->detect_encodings(buf, length);
+simdutf_warn_unused size_t count_utf8(const char* input, size_t length) noexcept
+{
+    return get_active_implementation()->count_utf8(input, length);
 }
-
-const implementation * builtin_implementation() {
-  static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
-  return builtin_impl;
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t* input, size_t length) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return utf8_length_from_utf16be(input, length);
+#else
+    return utf8_length_from_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf8_length_from_utf16le(input, length);
+}
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf8_length_from_utf16be(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t* input, size_t length) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return utf32_length_from_utf16be(input, length);
+#else
+    return utf32_length_from_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf32_length_from_utf16le(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf32_length_from_utf16be(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf16_length_from_utf8(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept
+{
+    return get_active_implementation()->utf16_length_from_latin1(length);
+}
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf8_length_from_utf32(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf16_length_from_utf32(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf32_length_from_utf8(input, length);
+}
+simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char* buf, size_t length) noexcept
+{
+    return get_active_implementation()->autodetect_encoding(buf, length);
+}
+simdutf_warn_unused int detect_encodings(const char* buf, size_t length) noexcept
+{
+    return get_active_implementation()->detect_encodings(buf, length);
 }
 
+const implementation* builtin_implementation()
+{
+    static const implementation* builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
+    return builtin_impl;
+}
 
 } // namespace simdutf
 
 /* end file src/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=encoding_types.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=encoding_types.cpp
 /* begin file src/encoding_types.cpp */
 
 namespace simdutf {
-bool match_system(endianness e) {
+bool match_system(endianness e)
+{
 #if SIMDUTF_IS_BIG_ENDIAN
     return e == endianness::BIG;
 #else
@@ -4996,69 +6268,91 @@ bool match_system(endianness e) {
 #endif
 }
 
-std::string to_string(encoding_type bom) {
-  switch (bom) {
-      case UTF16_LE:     return "UTF16 little-endian";
-      case UTF16_BE:     return "UTF16 big-endian";
-      case UTF32_LE:     return "UTF32 little-endian";
-      case UTF32_BE:     return "UTF32 big-endian";
-      case UTF8:         return "UTF8";
-      case unspecified:  return "unknown";
-      default:           return "error";
-  }
+std::string to_string(encoding_type bom)
+{
+    switch (bom) {
+    case UTF16_LE:
+        return "UTF16 little-endian";
+    case UTF16_BE:
+        return "UTF16 big-endian";
+    case UTF32_LE:
+        return "UTF32 little-endian";
+    case UTF32_BE:
+        return "UTF32 big-endian";
+    case UTF8:
+        return "UTF8";
+    case unspecified:
+        return "unknown";
+    default:
+        return "error";
+    }
 }
 
 namespace BOM {
 // Note that BOM for UTF8 is discouraged.
-encoding_type check_bom(const uint8_t* byte, size_t length) {
-        if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
-            if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
-                return encoding_type::UTF32_LE;
-            } else {
-                return encoding_type::UTF16_LE;
-            }
-        } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
-            return encoding_type::UTF16_BE;
-        } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
-            return encoding_type::UTF32_BE;
-        } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
-            return encoding_type::UTF8;
+encoding_type check_bom(const uint8_t* byte, size_t length)
+{
+    if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { // FF FE: UTF-16 LE, or UTF-32 LE when followed by 00 00
+        if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
+            return encoding_type::UTF32_LE;
+        } else {
+            return encoding_type::UTF16_LE;
         }
-        return encoding_type::unspecified;
+    } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { // FE FF
+        return encoding_type::UTF16_BE;
+    } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) { // 00 00 FE FF
+        return encoding_type::UTF32_BE;
+    } else if (length >= 3 && byte[0] == 0xef and byte[1] == 0xbb and byte[2] == 0xbf) { // EF BB BF: the UTF-8 BOM is 3 bytes, so test byte[2] and require only 3 bytes
+        return encoding_type::UTF8;
     }
+    return encoding_type::unspecified;
+}
 
-encoding_type check_bom(const char* byte, size_t length) {
-      return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
- }
-
- size_t bom_byte_size(encoding_type bom) {
-        switch (bom) {
-            case UTF16_LE:     return 2;
-            case UTF16_BE:     return 2;
-            case UTF32_LE:     return 4;
-            case UTF32_BE:     return 4;
-            case UTF8:         return 3;
-            case unspecified:  return 0;
-            default:           return 0;
-        }
+encoding_type check_bom(const char* byte, size_t length)
+{
+    return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
+}
+
+size_t bom_byte_size(encoding_type bom)
+{
+    switch (bom) {
+    case UTF16_LE:
+        return 2;
+    case UTF16_BE:
+        return 2;
+    case UTF32_LE:
+        return 4;
+    case UTF32_BE:
+        return 4;
+    case UTF8:
+        return 3;
+    case unspecified:
+        return 0;
+    default:
+        return 0;
+    }
 }
 
 }
 }
 /* end file src/encoding_types.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=error.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=error.cpp
 /* begin file src/error.cpp */
 namespace simdutf {
 
-  simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {};
+simdutf_really_inline result::result()
+    : error { error_code::SUCCESS }
+    , count { 0 } {};
 
-  simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {};
+simdutf_really_inline result::result(error_code _err, size_t _pos)
+    : error { _err }
+    , count { _pos } {};
 
 }
 /* end file src/error.cpp */
 // The large tables should be included once and they
 // should not depend on a kernel.
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=tables/utf8_to_utf16_tables.h
 /* begin file src/tables/utf8_to_utf16_tables.h */
 #ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
 #define SIMDUTF_UTF8_TO_UTF16_TABLES_H
@@ -5080,4316 +6374,4314 @@ namespace utf8_to_utf16 {
  * performance penalty.
  */
 
-const uint8_t shufutf8[209][16] =
-{	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
+const uint8_t shufutf8[209][16] = { { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0 } };
 /* number of two bytes : 64 */
 /* number of two + three bytes : 145 */
 /* number of two + three + four bytes : 209 */
-const uint8_t utf8bigindex[4096][2] =
-{	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{0, 12},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{171, 8},
- 	{72, 8},
- 	{183, 8},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{186, 8},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{187, 9},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{77, 7},
- 	{95, 7},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{192, 11},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{204, 11},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{207, 11},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{117, 11},
- 	{72, 8},
- 	{135, 11},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{141, 11},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{143, 11},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{31, 11},
- 	{47, 11},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{55, 11},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{59, 11},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{61, 11},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{62, 11},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{0, 12},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{208, 12},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{171, 8},
- 	{72, 8},
- 	{183, 8},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{186, 8},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{144, 12},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{77, 7},
- 	{95, 7},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{63, 12},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{192, 11},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{204, 11},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{207, 11},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{117, 11},
- 	{72, 8},
- 	{135, 11},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{141, 11},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{143, 11},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{31, 11},
- 	{47, 11},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{55, 11},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{59, 11},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{61, 11},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{62, 11},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6}};
+const uint8_t utf8bigindex[4096][2] = { { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 209, 12 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 171, 8 },
+    { 72, 8 },
+    { 183, 8 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 186, 8 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 187, 9 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 77, 7 },
+    { 95, 7 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 192, 11 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 204, 11 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 207, 11 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 117, 11 },
+    { 72, 8 },
+    { 135, 11 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 141, 11 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 143, 11 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 31, 11 },
+    { 47, 11 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 55, 11 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 59, 11 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 61, 11 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 62, 11 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 209, 12 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 208, 12 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 171, 8 },
+    { 72, 8 },
+    { 183, 8 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 186, 8 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 144, 12 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 77, 7 },
+    { 95, 7 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 63, 12 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 192, 11 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 204, 11 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 207, 11 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 117, 11 },
+    { 72, 8 },
+    { 135, 11 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 141, 11 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 143, 11 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 31, 11 },
+    { 47, 11 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 55, 11 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 59, 11 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 61, 11 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 62, 11 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 } };
 } // utf8_to_utf16 namespace
 } // tables namespace
 } // unnamed namespace
@@ -9397,7 +10689,7 @@ const uint8_t utf8bigindex[4096][2] =
 
 #endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
 /* end file src/tables/utf8_to_utf16_tables.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=tables/utf16_to_utf8_tables.h
 /* begin file src/tables/utf16_to_utf8_tables.h */
 // file generated by scripts/sse_convert_utf16_to_utf8.py
 #ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
@@ -9408,525 +10700,525 @@ namespace {
 namespace tables {
 namespace utf16_to_utf8 {
 
-  // 1 byte for length, 16 bytes for mask
-  const uint8_t pack_1_2_utf8_bytes[256][17] = {
-    {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
-    {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
-    {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80},
-    {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
-    {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
-    {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80},
-    {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
-    {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80},
-    {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80},
-    {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
-    {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
-    {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80},
-    {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80},
-    {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
-    {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
-    {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80},
-    {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
-  };
-
-  // 1 byte for length, 16 bytes for mask
-  const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
-    {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80},
-    {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
-    {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
-    {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
-    {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80},
-    {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
-  };
+// 1 byte for length, 16 bytes for mask
+const uint8_t pack_1_2_utf8_bytes[256][17] = {
+    { 16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
+    { 15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }
+};
+
+// 1 byte for length, 16 bytes for mask
+const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
+    { 12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }
+};
 
 } // utf16_to_utf8 namespace
 } // tables namespace
@@ -9938,7 +11230,7 @@ namespace utf16_to_utf8 {
 // End of tables.
 
 // The scalar routines should be included once.
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/ascii.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/ascii.h
 /* begin file src/scalar/ascii.h */
 #ifndef SIMDUTF_ASCII_H
 #define SIMDUTF_ASCII_H
@@ -9949,45 +11241,55 @@ namespace {
 namespace ascii {
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 // Only used by the fallback kernel.
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-    const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+inline simdutf_warn_unused bool validate(const char* buf, size_t len) noexcept
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
     uint64_t pos = 0;
     // process in blocks of 16 bytes when possible
-    for (;pos + 16 < len; pos += 16) {
+    for (; pos + 16 < len; pos += 16) {
         uint64_t v1;
         std::memcpy(&v1, data + pos, sizeof(uint64_t));
         uint64_t v2;
         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-        uint64_t v{v1 | v2};
-        if ((v & 0x8080808080808080) != 0) { return false; }
+        uint64_t v { v1 | v2 };
+        if ((v & 0x8080808080808080) != 0) {
+            return false;
+        }
     }
     // process the tail byte-by-byte
-    for (;pos < len; pos ++) {
-        if (data[pos] >= 0b10000000) { return false; }
+    for (; pos < len; pos++) {
+        if (data[pos] >= 0b10000000) {
+            return false;
+        }
     }
     return true;
 }
 #endif
 
-inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
-    const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+inline simdutf_warn_unused result validate_with_errors(const char* buf, size_t len) noexcept
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
     size_t pos = 0;
     // process in blocks of 16 bytes when possible
-    for (;pos + 16 < len; pos += 16) {
+    for (; pos + 16 < len; pos += 16) {
         uint64_t v1;
         std::memcpy(&v1, data + pos, sizeof(uint64_t));
         uint64_t v2;
         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-        uint64_t v{v1 | v2};
+        uint64_t v { v1 | v2 };
         if ((v & 0x8080808080808080) != 0) {
-            for (;pos < len; pos ++) {
-                if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
+            for (; pos < len; pos++) {
+                if (data[pos] >= 0b10000000) {
+                    return result(error_code::TOO_LARGE, pos);
+                }
             }
         }
     }
     // process the tail byte-by-byte
-    for (;pos < len; pos ++) {
-        if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
+    for (; pos < len; pos++) {
+        if (data[pos] >= 0b10000000) {
+            return result(error_code::TOO_LARGE, pos);
+        }
     }
     return result(error_code::SUCCESS, pos);
 }
@@ -9999,7 +11301,7 @@ inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t l
 
 #endif
 /* end file src/scalar/ascii.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8.h
 /* begin file src/scalar/utf8.h */
 #ifndef SIMDUTF_UTF8_H
 #define SIMDUTF_UTF8_H
@@ -10011,177 +11313,249 @@ namespace utf8 {
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 // only used by the fallback kernel.
 // credit: based on code from Google Fuchsia (Apache Licensed)
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  uint64_t pos = 0;
-  uint32_t code_point = 0;
-  while (pos < len) {
-    // check of the next 8 bytes are ascii.
-    uint64_t next_pos = pos + 16;
-    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v1;
-      std::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
+inline simdutf_warn_unused bool validate(const char* buf, size_t len) noexcept
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    uint64_t pos = 0;
+    uint32_t code_point = 0;
+    while (pos < len) {
+        // check if the next 16 bytes are ascii.
+        uint64_t next_pos = pos + 16;
+        if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            std::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 };
+            if ((v & 0x8080808080808080) == 0) {
+                pos = next_pos;
+                continue;
+            }
+        }
+        unsigned char byte = data[pos];
+
+        while (byte < 0b10000000) {
+            if (++pos == len) {
+                return true;
+            }
+            byte = data[pos];
+        }
+
+        if ((byte & 0b11100000) == 0b11000000) {
+            next_pos = pos + 2;
+            if (next_pos > len) {
+                return false;
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            // range check
+            code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if ((code_point < 0x80) || (0x7ff < code_point)) {
+                return false;
+            }
+        } else if ((byte & 0b11110000) == 0b11100000) {
+            next_pos = pos + 3;
+            if (next_pos > len) {
+                return false;
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            // range check
+            code_point = (byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
+                return false;
+            }
+        } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+            next_pos = pos + 4;
+            if (next_pos > len) {
+                return false;
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            // range check
+            code_point = (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff || 0x10ffff < code_point) {
+                return false;
+            }
+        } else {
+            // we may have a continuation
+            return false;
+        }
         pos = next_pos;
-        continue;
-      }
-    }
-    unsigned char byte = data[pos];
-
-    while (byte < 0b10000000) {
-      if (++pos == len) { return true; }
-      byte = data[pos];
-    }
-
-    if ((byte & 0b11100000) == 0b11000000) {
-      next_pos = pos + 2;
-      if (next_pos > len) { return false; }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
-      // range check
-      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if ((code_point < 0x80) || (0x7ff < code_point)) { return false; }
-    } else if ((byte & 0b11110000) == 0b11100000) {
-      next_pos = pos + 3;
-      if (next_pos > len) { return false; }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
-      // range check
-      code_point = (byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point) ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return false;
-      }
-    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-      next_pos = pos + 4;
-      if (next_pos > len) { return false; }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
-      // range check
-      code_point =
-          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
-    } else {
-      // we may have a continuation
-      return false;
     }
-    pos = next_pos;
-  }
-  return true;
+    return true;
 }
 #endif
 
-inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  uint32_t code_point = 0;
-  while (pos < len) {
-    // check of the next 8 bytes are ascii.
-    size_t next_pos = pos + 16;
-    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v1;
-      std::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
+inline simdutf_warn_unused result validate_with_errors(const char* buf, size_t len) noexcept
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    uint32_t code_point = 0;
+    while (pos < len) {
+        // check if the next 16 bytes are ascii.
+        size_t next_pos = pos + 16;
+        if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            std::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 };
+            if ((v & 0x8080808080808080) == 0) {
+                pos = next_pos;
+                continue;
+            }
+        }
+        unsigned char byte = data[pos];
+
+        while (byte < 0b10000000) {
+            if (++pos == len) {
+                return result(error_code::SUCCESS, len);
+            }
+            byte = data[pos];
+        }
+
+        if ((byte & 0b11100000) == 0b11000000) {
+            next_pos = pos + 2;
+            if (next_pos > len) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if ((code_point < 0x80) || (0x7ff < code_point)) {
+                return result(error_code::OVERLONG, pos);
+            }
+        } else if ((byte & 0b11110000) == 0b11100000) {
+            next_pos = pos + 3;
+            if (next_pos > len) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            code_point = (byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if ((code_point < 0x800) || (0xffff < code_point)) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xd7ff < code_point && code_point < 0xe000) {
+                return result(error_code::SURROGATE, pos);
+            }
+        } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+            next_pos = pos + 4;
+            if (next_pos > len) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            code_point = (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0x10ffff < code_point) {
+                return result(error_code::TOO_LARGE, pos);
+            }
+        } else {
+            // we either have too many continuation bytes or an invalid leading byte
+            if ((byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            } else {
+                return result(error_code::HEADER_BITS, pos);
+            }
+        }
         pos = next_pos;
-        continue;
-      }
-    }
-    unsigned char byte = data[pos];
-
-    while (byte < 0b10000000) {
-      if (++pos == len) { return result(error_code::SUCCESS, len); }
-      byte = data[pos];
-    }
-
-    if ((byte & 0b11100000) == 0b11000000) {
-      next_pos = pos + 2;
-      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); }
-    } else if ((byte & 0b11110000) == 0b11100000) {
-      next_pos = pos + 3;
-      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      code_point = (byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
-      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
-    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-      next_pos = pos + 4;
-      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      code_point =
-          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
-      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
-      else { return result(error_code::HEADER_BITS, pos); }
     }
-    pos = next_pos;
-  }
-  return result(error_code::SUCCESS, len);
+    return result(error_code::SUCCESS, len);
 }
 
 // Finds the previous leading byte and validates with errors from there
 // Used to pinpoint the location of an error when an invalid chunk is detected
-inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *buf, size_t len) noexcept {
-  size_t extra_len{0};
-  // A leading byte cannot be further than 4 bytes away
-  for(int i = 0; i < 5; i++) {
-    unsigned char byte = *buf;
-    if ((byte & 0b11000000) != 0b10000000) {
-      break;
-    } else {
-      buf--;
-      extra_len++;
+inline simdutf_warn_unused result rewind_and_validate_with_errors(const char* buf, size_t len) noexcept
+{
+    size_t extra_len { 0 };
+    // A leading byte cannot be further than 4 bytes away
+    for (int i = 0; i < 5; i++) {
+        unsigned char byte = *buf;
+        if ((byte & 0b11000000) != 0b10000000) {
+            break;
+        } else {
+            buf--;
+            extra_len++;
+        }
     }
-  }
 
-  result res = validate_with_errors(buf, len + extra_len);
-  res.count -= extra_len;
-  return res;
+    result res = validate_with_errors(buf, len + extra_len);
+    res.count -= extra_len;
+    return res;
 }
 
-inline size_t count_code_points(const char* buf, size_t len) {
-    const int8_t * p = reinterpret_cast<const int8_t *>(buf);
-    size_t counter{0};
-    for(size_t i = 0; i < len; i++) {
+inline size_t count_code_points(const char* buf, size_t len)
+{
+    const int8_t* p = reinterpret_cast<const int8_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
         // -65 is 0b10111111, anything larger in two-complement's should start a new code point.
-        if(p[i] > -65) { counter++; }
+        if (p[i] > -65) {
+            counter++;
+        }
     }
     return counter;
 }
 
-inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
-    const int8_t * p = reinterpret_cast<const int8_t *>(buf);
-    size_t counter{0};
-    for(size_t i = 0; i < len; i++) {
-        if(p[i] > -65) { counter++; }
-        if(uint8_t(p[i]) >= 240) { counter++; }
+inline size_t utf16_length_from_utf8(const char* buf, size_t len)
+{
+    const int8_t* p = reinterpret_cast<const int8_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        if (p[i] > -65) {
+            counter++;
+        }
+        if (uint8_t(p[i]) >= 240) {
+            counter++;
+        }
     }
     return counter;
 }
 
+inline size_t latin1_length_from_utf8(const char* buf, size_t len)
+{
+    const uint8_t* c = reinterpret_cast<const uint8_t*>(buf);
+
+    size_t answer = len;
+    for (size_t i = 0; i < len; i++) {
+        if ((c[i] & 0b11100000) == 0b11000000) {
+            answer--;
+        } // if we have a two-byte UTF8 character
+    }
+    return answer;
+}
+
 } // utf8 namespace
 } // unnamed namespace
 } // namespace scalar
@@ -10189,7 +11563,7 @@ inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
 
 #endif
 /* end file src/scalar/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16.h
 /* begin file src/scalar/utf16.h */
 #ifndef SIMDUTF_UTF16_H
 #define SIMDUTF_UTF16_H
@@ -10199,101 +11573,133 @@ namespace scalar {
 namespace {
 namespace utf16 {
 
-inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
-  return uint16_t((word >> 8) | (word << 8));
-}
-
-template <endianness big_endian>
-inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  uint64_t pos = 0;
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800) == 0xD800) {
-        if(pos + 1 >= len) { return false; }
-        uint16_t diff = uint16_t(word - 0xD800);
-        if(diff > 0x3FF) { return false; }
-        uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-        uint16_t diff2 = uint16_t(next_word - 0xDC00);
-        if(diff2 > 0x3FF) { return false; }
-        pos += 2;
-    } else {
-        pos++;
+inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word)
+{
+    return uint16_t((word >> 8) | (word << 8));
+}
+
+template<endianness big_endian>
+inline simdutf_warn_unused bool validate(const char16_t* buf, size_t len) noexcept
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    uint64_t pos = 0;
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) == 0xD800) {
+            if (pos + 1 >= len) {
+                return false;
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return false;
+            }
+            uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return false;
+            }
+            pos += 2;
+        } else {
+            pos++;
+        }
     }
-  }
-  return true;
-}
-
-template <endianness big_endian>
-inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-    if((word & 0xF800) == 0xD800) {
-        if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
-        uint16_t diff = uint16_t(word - 0xD800);
-        if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
-        uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-        uint16_t diff2 = uint16_t(next_word - 0xDC00);
-        if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
-        pos += 2;
-    } else {
-        pos++;
+    return true;
+}
+
+template<endianness big_endian>
+inline simdutf_warn_unused result validate_with_errors(const char16_t* buf, size_t len) noexcept
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) == 0xD800) {
+            if (pos + 1 >= len) {
+                return result(error_code::SURROGATE, pos);
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return result(error_code::SURROGATE, pos);
+            }
+            uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return result(error_code::SURROGATE, pos);
+            }
+            pos += 2;
+        } else {
+            pos++;
+        }
+    }
+    return result(error_code::SUCCESS, pos);
+}
+
+template<endianness big_endian>
+inline size_t count_code_points(const char16_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+        counter += ((word & 0xFC00) != 0xDC00);
+    }
+    return counter;
+}
+
+template<endianness big_endian>
+inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+        /** ASCII **/
+        if (word <= 0x7F) {
+            counter++;
+        }
+        /** two-byte **/
+        else if (word <= 0x7FF) {
+            counter += 2;
+        }
+        /** three-byte **/
+        else if ((word <= 0xD7FF) || (word >= 0xE000)) {
+            counter += 3;
+        }
+        /** surrogate word -- a pair totals 4 bytes, so each half counts 2 **/
+        else {
+            counter += 2;
+        }
+    }
+    return counter;
+}
+
+template<endianness big_endian>
+inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+        counter += ((word & 0xFC00) != 0xDC00);
+    }
+    return counter;
+}
+
+inline size_t latin1_length_from_utf16(size_t len)
+{
+    return len;
+}
+
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out)
+{
+    const uint16_t* input = reinterpret_cast<const uint16_t*>(in);
+    uint16_t* output = reinterpret_cast<uint16_t*>(out);
+    for (size_t i = 0; i < size; i++) {
+        *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
     }
-  }
-  return result(error_code::SUCCESS, pos);
-}
-
-template <endianness big_endian>
-inline size_t count_code_points(const char16_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter += ((word & 0xFC00) != 0xDC00);
-  }
-  return counter;
-}
-
-template <endianness big_endian>
-inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    /** ASCII **/
-    if(word <= 0x7F) { counter++; }
-    /** two-byte **/
-    else if (word <= 0x7FF) { counter += 2; }
-    /** three-byte **/
-    else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; }
-    /** surrogates -- 4 bytes **/
-    else { counter += 2; }
-  }
-  return counter;
-}
-
-template <endianness big_endian>
-inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter += ((word & 0xFC00) != 0xDC00);
-  }
-  return counter;
-}
-
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) {
-  const uint16_t * input = reinterpret_cast<const uint16_t *>(in);
-  uint16_t * output = reinterpret_cast<uint16_t *>(out);
-  for (size_t i = 0; i < size; i++) {
-    *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
-  }
 }
 
 } // utf16 namespace
@@ -10303,7 +11709,7 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 
 #endif
 /* end file src/scalar/utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32.h
 /* begin file src/scalar/utf32.h */
 #ifndef SIMDUTF_UTF32_H
 #define SIMDUTF_UTF32_H
@@ -10313,61 +11719,83 @@ namespace scalar {
 namespace {
 namespace utf32 {
 
-inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  uint64_t pos = 0;
-  for(;pos < len; pos++) {
-    uint32_t word = data[pos];
-    if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
-        return false;
+inline simdutf_warn_unused bool validate(const char32_t* buf, size_t len) noexcept
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    uint64_t pos = 0;
+    for (; pos < len; pos++) {
+        uint32_t word = data[pos];
+        if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+inline simdutf_warn_unused result validate_with_errors(const char32_t* buf, size_t len) noexcept
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    for (; pos < len; pos++) {
+        uint32_t word = data[pos];
+        if (word > 0x10FFFF) {
+            return result(error_code::TOO_LARGE, pos);
+        }
+        if (word >= 0xD800 && word <= 0xDFFF) {
+            return result(error_code::SURROGATE, pos);
+        }
     }
-  }
-  return true;
-}
-
-inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  for(;pos < len; pos++) {
-    uint32_t word = data[pos];
-    if(word > 0x10FFFF) {
-        return result(error_code::TOO_LARGE, pos);
-    }
-    if(word >= 0xD800 && word <= 0xDFFF) {
-        return result(error_code::SURROGATE, pos);
-    }
-  }
-  return result(error_code::SUCCESS, pos);
-}
-
-inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint32_t * p = reinterpret_cast<const uint32_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    /** ASCII **/
-    if(p[i] <= 0x7F) { counter++; }
-    /** two-byte **/
-    else if(p[i] <= 0x7FF) { counter += 2; }
-    /** three-byte **/
-    else if(p[i] <= 0xFFFF) { counter += 3; }
-    /** four-bytes **/
-    else { counter += 4; }
-  }
-  return counter;
-}
-
-inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint32_t * p = reinterpret_cast<const uint32_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    /** non-surrogate word **/
-    if(p[i] <= 0xFFFF) { counter++; }
-    /** surrogate pair **/
-    else { counter += 2; }
-  }
-  return counter;
+    return result(error_code::SUCCESS, pos);
+}
+
+inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        /** ASCII **/
+        if (p[i] <= 0x7F) {
+            counter++;
+        }
+        /** two-byte **/
+        else if (p[i] <= 0x7FF) {
+            counter += 2;
+        }
+        /** three-byte **/
+        else if (p[i] <= 0xFFFF) {
+            counter += 3;
+        }
+        /** four-bytes **/
+        else {
+            counter += 4;
+        }
+    }
+    return counter;
+}
+
+inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        /** non-surrogate word **/
+        if (p[i] <= 0xFFFF) {
+            counter++;
+        }
+        /** surrogate pair **/
+        else {
+            counter += 2;
+        }
+    }
+    return counter;
+}
+
+inline size_t latin1_length_from_utf32(size_t len)
+{
+    // We are not BOM aware.
+    return len; // a utf32 codepoint will always represent 1 latin1 character
 }
 
 } // utf32 namespace
@@ -10377,8 +11805,48 @@ inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
 
 #endif
 /* end file src/scalar/utf32.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1.h
+/* begin file src/scalar/latin1.h */
+#ifndef SIMDUTF_LATIN1_H
+#define SIMDUTF_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1 {
+
+inline size_t utf32_length_from_latin1(size_t len)
+{
+    // We are not BOM aware.
+    return len; // each latin1 character converts to exactly one utf32 code unit
+}
+
+inline size_t utf8_length_from_latin1(const char* buf, size_t len)
+{
+    const uint8_t* c = reinterpret_cast<const uint8_t*>(buf);
+    size_t answer = 0;
+    for (size_t i = 0; i < len; i++) {
+        if ((c[i] >> 7)) {
+            answer++;
+        }
+    }
+    return answer + len;
+}
+
+inline size_t utf16_length_from_latin1(size_t len)
+{
+    return len;
+}
+
+} // latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h
+#endif
+/* end file src/scalar/latin1.h */
+
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h
 /* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
 #ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
 #define SIMDUTF_VALID_UTF32_TO_UTF8_H
@@ -10390,51 +11858,52 @@ namespace utf32_to_utf8 {
 
 #if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 // only used by the fallback and POWER kernel
-inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
-	const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
+inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 2 ASCII characters
+        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+                *utf8_output++ = char(buf[pos]);
+                *utf8_output++ = char(buf[pos + 1]);
+                pos += 2;
+                continue;
+            }
+        }
+        uint32_t word = data[pos];
+        if ((word & 0xFFFFFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xFFFFF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xFFFF0000) == 0) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 18) | 0b11110000);
+            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        }
     }
-  }
-  return utf8_output - start;
+    return utf8_output - start;
 }
 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 
@@ -10445,7 +11914,7 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output)
 
 #endif
 /* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h
 /* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
 #ifndef SIMDUTF_UTF32_TO_UTF8_H
 #define SIMDUTF_UTF32_TO_UTF8_H
@@ -10455,102 +11924,112 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf8 {
 
-inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-			if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-			if (word > 0x10FFFF) { return 0; }
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
-    }
-  }
-  return utf8_output - start;
-}
-
-inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-			if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-			if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
+inline size_t convert(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 2 ASCII characters
+        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+                *utf8_output++ = char(buf[pos]);
+                *utf8_output++ = char(buf[pos + 1]);
+                pos += 2;
+                continue;
+            }
+        }
+        uint32_t word = data[pos];
+        if ((word & 0xFFFFFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xFFFFF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xFFFF0000) == 0) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            if (word >= 0xD800 && word <= 0xDFFF) {
+                return 0;
+            }
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            if (word > 0x10FFFF) {
+                return 0;
+            }
+            *utf8_output++ = char((word >> 18) | 0b11110000);
+            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        }
+    }
+    return utf8_output - start;
+}
+
+inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 2 ASCII characters
+        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+                *utf8_output++ = char(buf[pos]);
+                *utf8_output++ = char(buf[pos + 1]);
+                pos += 2;
+                continue;
+            }
+        }
+        uint32_t word = data[pos];
+        if ((word & 0xFFFFFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xFFFFF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xFFFF0000) == 0) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            if (word >= 0xD800 && word <= 0xDFFF) {
+                return result(error_code::SURROGATE, pos);
+            }
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            if (word > 0x10FFFF) {
+                return result(error_code::TOO_LARGE, pos);
+            }
+            *utf8_output++ = char((word >> 18) | 0b11110000);
+            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf8_output - start);
+    return result(error_code::SUCCESS, utf8_output - start);
 }
 
 } // utf32_to_utf8 namespace
@@ -10561,7 +12040,7 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_ou
 #endif
 /* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h
 /* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
 #ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
 #define SIMDUTF_VALID_UTF32_TO_UTF16_H
@@ -10571,32 +12050,33 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-      pos++;
-    } else {
-      // will generate a surrogate pair
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos++;
-    }
-  }
-  return utf16_output - start;
+template<endianness big_endian>
+inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        uint32_t word = data[pos];
+        if ((word & 0xFFFF0000) == 0) {
+            // will not generate a surrogate pair
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
+            pos++;
+        } else {
+            // will generate a surrogate pair
+            word -= 0x10000;
+            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos++;
+        }
+    }
+    return utf16_output - start;
 }
 
 } // utf32_to_utf16 namespace
@@ -10606,7 +12086,7 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_out
 
 #endif
 /* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h
 /* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
 #ifndef SIMDUTF_UTF32_TO_UTF16_H
 #define SIMDUTF_UTF32_TO_UTF16_H
@@ -10616,62 +12096,72 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-    } else {
-      // will generate a surrogate pair
-      if (word > 0x10FFFF) { return 0; }
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-    }
-    pos++;
-  }
-  return utf16_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-    } else {
-      // will generate a surrogate pair
-      if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-    }
-    pos++;
-  }
-  return result(error_code::SUCCESS, utf16_output - start);
+template<endianness big_endian>
+inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        uint32_t word = data[pos];
+        if ((word & 0xFFFF0000) == 0) {
+            if (word >= 0xD800 && word <= 0xDFFF) {
+                return 0;
+            }
+            // will not generate a surrogate pair
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
+        } else {
+            // will generate a surrogate pair
+            if (word > 0x10FFFF) {
+                return 0;
+            }
+            word -= 0x10000;
+            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+        }
+        pos++;
+    }
+    return utf16_output - start;
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        uint32_t word = data[pos];
+        if ((word & 0xFFFF0000) == 0) {
+            if (word >= 0xD800 && word <= 0xDFFF) {
+                return result(error_code::SURROGATE, pos);
+            }
+            // will not generate a surrogate pair
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
+        } else {
+            // will generate a surrogate pair
+            if (word > 0x10FFFF) {
+                return result(error_code::TOO_LARGE, pos);
+            }
+            word -= 0x10000;
+            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+        }
+        pos++;
+    }
+    return result(error_code::SUCCESS, utf16_output - start);
 }
 
 } // utf32_to_utf16 namespace
@@ -10682,7 +12172,7 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf
 #endif
 /* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
 /* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
 #ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
 #define SIMDUTF_VALID_UTF16_TO_UTF8_H
@@ -10692,62 +12182,67 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf8 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 4 ASCII characters
-    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while(pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word & 0xFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word &0xF800 ) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value>>18) | 0b11110000);
-      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return utf8_output - start;
+template<endianness big_endian>
+inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output)
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 4 ASCII characters
+        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if (!match_system(big_endian)) {
+                v = (v >> 8) | (v << (64 - 8));
+            }
+            if ((v & 0xFF80FF80FF80FF80) == 0) {
+                size_t final_pos = pos + 4;
+                while (pos < final_pos) {
+                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xF800) != 0xD800) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // must be a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            uint32_t value = (diff << 10) + diff2 + 0x10000;
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((value >> 18) | 0b11110000);
+            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            pos += 2;
+        }
+    }
+    return utf8_output - start;
 }
 
 } // utf16_to_utf8 namespace
@@ -10757,7 +12252,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output)
 
 #endif
 /* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
 /* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
 #ifndef SIMDUTF_UTF16_TO_UTF8_H
 #define SIMDUTF_UTF16_TO_UTF8_H
@@ -10767,122 +12262,139 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf8 {
 
-template <endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII characters
-    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while(pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word & 0xFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word &0xF800 ) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      if(pos + 1 >= len) { return 0; }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return 0; }
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return 0; }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value>>18) | 0b11110000);
-      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return utf8_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII characters
-    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while(pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word & 0xFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word &0xF800 ) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value>>18) | 0b11110000);
-      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return result(error_code::SUCCESS, utf8_output - start);
+template<endianness big_endian>
+inline size_t convert(const char16_t* buf, size_t len, char* utf8_output)
+{ // Transcodes `len` UTF-16 code units from `buf` into UTF-8 at `utf8_output`; returns the bytes written, or 0 on an unpaired surrogate.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // fast path: try to convert the next block of 4 ASCII code units (8 bytes) in one go
+        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if (!match_system(big_endian)) {
+                v = (v >> 8) | (v << (64 - 8)); // rotate right by one byte so the mask below tests each unit as if byte-swapped
+            }
+            if ((v & 0xFF80FF80FF80FF80) == 0) {
+                size_t final_pos = pos + 4;
+                while (pos < final_pos) {
+                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xF800) != 0xD800) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // word is in the surrogate range 0xD800-0xDFFF: must be a surrogate pair
+            if (pos + 1 >= len) {
+                return 0; // no second code unit left to complete the pair
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return 0; // word is 0xDC00-0xDFFF: a low surrogate cannot lead a pair
+            }
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return 0; // next_word is outside 0xDC00-0xDFFF: not a low surrogate
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // 10 bits from each surrogate, offset by 0x10000
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((value >> 18) | 0b11110000);
+            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            pos += 2;
+        }
+    }
+    return utf8_output - start; // number of UTF-8 bytes written
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output)
+{ // Transcodes `len` UTF-16 code units from `buf` into UTF-8 at `utf8_output`; the result carries SUCCESS + bytes written, or SURROGATE + offending position.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // fast path: try to convert the next block of 4 ASCII code units (8 bytes) in one go
+        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if (!match_system(big_endian))
+                v = (v >> 8) | (v << (64 - 8)); // rotate right by one byte so the mask below tests each unit as if byte-swapped
+            if ((v & 0xFF80FF80FF80FF80) == 0) {
+                size_t final_pos = pos + 4;
+                while (pos < final_pos) {
+                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xF800) != 0xD800) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // word is in the surrogate range 0xD800-0xDFFF: must be a surrogate pair
+            if (pos + 1 >= len) {
+                return result(error_code::SURROGATE, pos); // no second code unit left to complete the pair
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return result(error_code::SURROGATE, pos); // word is 0xDC00-0xDFFF: a low surrogate cannot lead a pair
+            }
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return result(error_code::SURROGATE, pos); // next_word is outside 0xDC00-0xDFFF; pos still names the first unit of the pair
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // 10 bits from each surrogate, offset by 0x10000
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((value >> 18) | 0b11110000);
+            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            pos += 2;
+        }
+    }
+    return result(error_code::SUCCESS, utf8_output - start); // payload is the number of UTF-8 bytes written
 }
 
 } // utf16_to_utf8 namespace
@@ -10893,7 +12405,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_ou
 #endif
 /* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h
 /* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
 #ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
 #define SIMDUTF_VALID_UTF16_TO_UTF32_H
@@ -10903,29 +12415,32 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf32 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800 ) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
+template<endianness big_endian>
+inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output)
+{ // Transcodes `len` UTF-16 code units from `buf` into UTF-32 at `utf32_output` without validating surrogates; returns the code points written.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) != 0xD800) {
+            // Not a surrogate (outside 0xD800-0xDFFF): extend the 16-bit word to a 32-bit word
+            *utf32_output++ = char32_t(word);
+            pos++;
+        } else {
+            // must be a surrogate pair; neither half is range-checked here (input is presumed valid, per the function name)
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking: a surrogate in the last position yields 0
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both halves, offset by 0x10000
+            *utf32_output++ = char32_t(value);
+            pos += 2;
+        }
     }
-  }
-  return utf32_output - start;
+    return utf32_output - start; // number of UTF-32 code points written
 }
 
 } // utf16_to_utf32 namespace
@@ -10935,7 +12450,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_out
 
 #endif
 /* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h
 /* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
 #ifndef SIMDUTF_UTF16_TO_UTF32_H
 #define SIMDUTF_UTF16_TO_UTF32_H
@@ -10945,58 +12460,72 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf32 {
 
-template <endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800 ) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return 0; }
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return 0; }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
-    }
-  }
-  return utf32_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800 ) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
+template<endianness big_endian>
+inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output)
+{ // Transcodes `len` UTF-16 code units from `buf` into UTF-32 at `utf32_output`; returns the code points written, or 0 on an unpaired surrogate.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) != 0xD800) {
+            // Not a surrogate (outside 0xD800-0xDFFF): extend the 16-bit word to a 32-bit word
+            *utf32_output++ = char32_t(word);
+            pos++;
+        } else {
+            // word is in the surrogate range 0xD800-0xDFFF: must be a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return 0; // word is 0xDC00-0xDFFF: a low surrogate cannot lead a pair
+            }
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking: no second code unit left to complete the pair
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return 0; // next_word is outside 0xDC00-0xDFFF: not a low surrogate
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // 10 bits from each surrogate, offset by 0x10000
+            *utf32_output++ = char32_t(value);
+            pos += 2;
+        }
+    }
+    return utf32_output - start; // number of UTF-32 code points written
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
+{ // Transcodes `len` UTF-16 code units from `buf` into UTF-32 at `utf32_output`; the result carries SUCCESS + code points written, or SURROGATE + offending position.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) != 0xD800) {
+            // Not a surrogate (outside 0xD800-0xDFFF): extend the 16-bit word to a 32-bit word
+            *utf32_output++ = char32_t(word);
+            pos++;
+        } else {
+            // word is in the surrogate range 0xD800-0xDFFF: must be a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return result(error_code::SURROGATE, pos); // word is 0xDC00-0xDFFF: a low surrogate cannot lead a pair
+            }
+            if (pos + 1 >= len) {
+                return result(error_code::SURROGATE, pos);
+            } // minimal bound checking: no second code unit left to complete the pair
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return result(error_code::SURROGATE, pos); // next_word is outside 0xDC00-0xDFFF; pos still names the first unit of the pair
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // 10 bits from each surrogate, offset by 0x10000
+            *utf32_output++ = char32_t(value);
+            pos += 2;
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf32_output - start);
+    return result(error_code::SUCCESS, utf32_output - start); // payload is the number of UTF-32 code points written
 }
 
 } // utf16_to_utf32 namespace
@@ -11007,7 +12536,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf
 #endif
 /* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
 #ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
 #define SIMDUTF_VALID_UTF8_TO_UTF16_H
@@ -11017,74 +12546,80 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII bytes
-    if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 8;
-        while(pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 1 >= len) { break; } // minimal bound checking
-      uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
-      if (!match_system(big_endian)) {
-        code_point = utf16::swap_bytes(uint16_t(code_point));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 2 >= len) { break; } // minimal bound checking
-      uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
-      if (!match_system(big_endian)) {
-        code_point = utf16::swap_bytes(uint16_t(code_point));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { break; } // minimal bound checking
-      uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
-                           | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
+template<endianness big_endian>
+inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output)
+{ // Transcodes `len` UTF-8 bytes from `buf` into UTF-16 at `utf16_output` without checking continuation bytes; returns the code units written.
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        // fast path: try to convert the next block of 8 ASCII bytes in one go
+        if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set: all eight are ASCII
+                size_t final_pos = pos + 8;
+                while (pos < final_pos) {
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 1 >= len) {
+                break;
+            } // minimal bound checking: truncated sequence, stop and return what was converted so far
+            uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
+            if (!match_system(big_endian)) {
+                code_point = utf16::swap_bytes(uint16_t(code_point));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 2 >= len) {
+                break;
+            } // minimal bound checking: truncated sequence, stop and return what was converted so far
+            uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
+            if (!match_system(big_endian)) {
+                code_point = utf16::swap_bytes(uint16_t(code_point));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // leading byte is 0b11110XXX
+            // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+            if (pos + 3 >= len) {
+                break;
+            } // minimal bound checking: truncated sequence, stop and return what was converted so far
+            uint32_t code_point = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
+                | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
+            code_point -= 0x10000; // remove the 0x10000 offset before splitting into two 10-bit halves
+            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos += 4;
+        } else {
+            // leading byte matches none of the patterns above (e.g. a stray continuation byte); no error reporting beyond returning 0
+            return 0;
+        }
     }
-  }
-  return utf16_output - start;
+    return utf16_output - start; // number of UTF-16 code units written
 }
 
-
 } // namespace utf8_to_utf16
 } // unnamed namespace
 } // namespace scalar
@@ -11092,7 +12627,7 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output)
 
 #endif
 /* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
 #ifndef SIMDUTF_UTF8_TO_UTF16_H
 #define SIMDUTF_UTF8_TO_UTF16_H
@@ -11102,184 +12637,230 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 2 >= len) { return 0; } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return 0;
-      }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      return 0;
-    }
-  }
-  return utf16_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
-      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
-      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
-      else { return result(error_code::HEADER_BITS, pos); }
+template<endianness big_endian> // Validating scalar UTF-8 -> UTF-16 transcoder; code units are byte-swapped when match_system(big_endian) is false.
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output)
+{ // Returns the number of char16_t written to utf16_output, or 0 as soon as the input is found to be invalid or truncated.
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 };
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set, so all 16 are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 1 >= len) {
+                return 0;
+            } // bound check: the input ends before the continuation byte
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // next byte must look like 10xxxxxx
+                return 0;
+            }
+            // range check: a two-byte sequence must encode at least 0x80 (rejects overlong encodings)
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) {
+                return 0;
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 2 >= len) {
+                return 0;
+            } // bound check: the input ends before the second continuation byte
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            // range check: rejects overlong encodings (< 0x800) and the surrogate range 0xD800-0xDFFF
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if (code_point < 0x800 || 0xffff < code_point || (0xd7ff < code_point && code_point < 0xe000)) {
+                return 0;
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // we have a 4-byte UTF-8 word.
+            if (pos + 3 >= len) {
+                return 0;
+            } // bound check: the input ends before the third continuation byte
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+
+            // range check: rejects overlong encodings (<= 0xFFFF) and values above 0x10FFFF
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff || 0x10ffff < code_point) {
+                return 0;
+            }
+            code_point -= 0x10000; // the remaining 20 bits are split across a surrogate pair
+            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos += 4;
+        } else { // stray continuation byte (10xxxxxx) or a leading byte of the form 11111xxx
+            return 0;
+        }
+    }
+    return utf16_output - start;
+}
+
+template<endianness big_endian> // Validating scalar UTF-8 -> UTF-16 transcoder that reports the kind and position of the first error.
+inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output)
+{ // Returns result(SUCCESS, number of char16_t written), or result(<error code>, pos) where pos indexes the offending leading byte in buf.
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 };
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set, so all 16 are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 1 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // bound check: the input ends before the continuation byte
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // a missing continuation byte is also reported as TOO_SHORT
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check: a two-byte sequence must encode at least 0x80
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 2 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // bound check: the input ends before the second continuation byte
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check: a three-byte sequence must encode at least 0x800 and must not be a surrogate
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if ((code_point < 0x800) || (0xffff < code_point)) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xd7ff < code_point && code_point < 0xe000) { // 0xD800-0xDFFF
+                return result(error_code::SURROGATE, pos);
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // we have a 4-byte UTF-8 word.
+            if (pos + 3 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // bound check: the input ends before the third continuation byte
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+
+            // range check: a four-byte sequence must encode a value in 0x10000-0x10FFFF
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0x10ffff < code_point) {
+                return result(error_code::TOO_LARGE, pos);
+            }
+            code_point -= 0x10000; // the remaining 20 bits are split across a surrogate pair
+            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos += 4;
+        } else {
+            // a continuation byte where a leading byte was expected (TOO_LONG), or a leading byte of the form 11111xxx (HEADER_BITS)
+            if ((leading_byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            } else {
+                return result(error_code::HEADER_BITS, pos);
+            }
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf16_output - start);
+    return result(error_code::SUCCESS, utf16_output - start);
 }
 
 /**
@@ -11295,41 +12876,44 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o
  * If the error is believed to have occured prior to 'buf', the count value contain in the result
  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
  */
-template <endianness endian>
-inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  size_t how_far_back = 3; // 3 bytes in the past + current position
-  if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-i];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if(found_leading_bytes) {
-      buf -= i;
-      extra_len = i;
-      break;
-    }
-  }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
-  // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if(!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is continuation]
-    // Or we possibly have a stream that does not start with a leading byte.
-    return result(error_code::TOO_LONG, -how_far_back);
-  }
-  result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
-  if (res.error) {
-    res.count -= extra_len;
-  }
-  return res;
+template<endianness endian> // endian is forwarded unchanged to convert_with_errors<endian>
+inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output)
+{ // prior_bytes bounds how far before buf the backward scan may read (buf[-1] down to buf[-prior_bytes]).
+    size_t extra_len { 0 };
+    // We potentially need to go back in time and find a leading byte.
+    // In theory '3' would be sufficient, but sometimes the error can go back quite far.
+    size_t how_far_back = prior_bytes; // so the scan is allowed to cover every prior byte
+    // Disabled previous limit: size_t how_far_back = 3; // 3 bytes in the past + current position
+    // Disabled previous clamp: if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+    bool found_leading_bytes { false };
+    // important: it is i <= how_far_back and not 'i < how_far_back'.
+    for (size_t i = 0; i <= how_far_back; i++) {
+        unsigned char byte = buf[0 - i]; // NOTE(review): 0 - i is evaluated in size_t and wraps; this relies on the index wrapping back to buf - i — confirm
+        found_leading_bytes = ((byte & 0b11000000) != 0b10000000); // anything other than 10xxxxxx is not a continuation byte
+        if (found_leading_bytes) {
+            buf -= i; // restart the conversion at that byte
+            extra_len = i; // and remember how many bytes we stepped back
+            break;
+        }
+    }
+    //
+    // It is possible for this function to return a negative count in its result.
+    // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
+    // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
+    //
+    // An unsigned type will simply wrap round arithmetically (well defined).
+    //
+    if (!found_leading_bytes) {
+        // Every inspected byte (buf[0] and the how_far_back bytes before it) is a continuation byte:
+        // [continuation] ... [continuation] | [buf is continuation]
+        // so the stream does not start with a leading byte.
+        return result(error_code::TOO_LONG, 0 - how_far_back); // wrapped count, i.e. "-how_far_back"; no longer limited to 3
+    }
+    result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output); // len grows by the bytes stepped back over
+    if (res.error) {
+        res.count -= extra_len; // re-express the error position relative to the original buf (wraps if the error lies before it)
+    }
+    return res;
 }
 
 } // utf8_to_utf16 namespace
@@ -11340,7 +12924,7 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
 #endif
 /* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
 #ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
 #define SIMDUTF_VALID_UTF8_TO_UTF32_H
@@ -11350,55 +12934,61 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf32 {
 
-inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII bytes
-    if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 8;
-        while(pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if(pos + 1 >= len) { break; } // minimal bound checking
-      *utf32_output++ = char32_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if(pos + 2 >= len) { break; } // minimal bound checking
-      *utf32_output++ = char32_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { break; } // minimal bound checking
-      uint32_t code_word = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
-                           | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
-      *utf32_output++ = char32_t(code_word);
-      pos += 4;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
+inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) // scalar UTF-8 -> UTF-32 decoder that does no validation of continuation bytes or ranges
+{ // Returns the number of char32_t written; a sequence cut off by len ends the conversion early, an unexpected leading byte returns 0.
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    while (pos < len) {
+        // try to convert the next block of 8 ASCII bytes
+        if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set, so all 8 are ASCII
+                size_t final_pos = pos + 8;
+                while (pos < final_pos) {
+                    *utf32_output++ = char32_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *utf32_output++ = char32_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                break;
+            } // bound check: the sequence is cut off by len, keep what was converted so far
+            *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8
+            if (pos + 2 >= len) {
+                break;
+            } // bound check: the sequence is cut off by len, keep what was converted so far
+            *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // we have a 4-byte UTF-8 word.
+            if (pos + 3 >= len) {
+                break;
+            } // bound check: the sequence is cut off by len, keep what was converted so far
+            uint32_t code_word = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
+                | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
+            *utf32_output++ = char32_t(code_word);
+            pos += 4;
+        } else {
+            // a continuation byte or a 11111xxx byte in leading position; no error checking is done, so give up with 0
+            return 0;
+        }
     }
-  }
-  return utf32_output - start;
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace scalar
@@ -11406,7 +12996,7 @@ inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output)
 
 #endif
 /* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
 #ifndef SIMDUTF_UTF8_TO_UTF32_H
 #define SIMDUTF_UTF8_TO_UTF32_H
@@ -11416,149 +13006,195 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf32 {
 
-inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
-      *utf32_output++ = char32_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if(pos + 2 >= len) { return 0; } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return 0;
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
-      *utf32_output++ = char32_t(code_point);
-      pos += 4;
-    } else {
-      return 0;
-    }
-  }
-  return utf32_output - start;
-}
-
-inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
-      *utf32_output++ = char32_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); }
-      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
-      *utf32_output++ = char32_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);}
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
-      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
-      *utf32_output++ = char32_t(code_point);
-      pos += 4;
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
-      else { return result(error_code::HEADER_BITS, pos); }
+inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) // validating scalar UTF-8 -> UTF-32: returns the number of char32_t written, or 0 as soon as invalid input is seen
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // OR of both words: a set high bit in any of the 16 bytes survives
+            if ((v & 0x8080808080808080) == 0) {
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf32_output++ = char32_t(buf[pos]); // every byte here is < 0x80, so it widens unchanged
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *utf32_output++ = char32_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0; // second byte is not a 10xxxxxx continuation byte
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) {
+                return 0; // below 0x80 means the value was encoded with more bytes than needed (overlong)
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8
+            if (pos + 2 >= len) {
+                return 0;
+            } // minimal bound checking
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if (code_point < 0x800 || 0xffff < code_point || (0xd7ff < code_point && code_point < 0xe000)) {
+                return 0; // overlong, out of the three-byte range, or in the surrogate range 0xD800-0xDFFF
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // we have a 4-byte UTF-8 word.
+            if (pos + 3 >= len) {
+                return 0;
+            } // minimal bound checking
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return 0;
+            }
+
+            // range check
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff || 0x10ffff < code_point) {
+                return 0; // overlong, or above the last code point U+10FFFF
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 4;
+        } else {
+            return 0; // stray continuation byte or a leading byte with five or more high bits set
+        }
+    }
+    return utf32_output - start; // note: also 0 for empty input, which is indistinguishable from failure
+}
+
+inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) // validating scalar UTF-8 -> UTF-32: on success count = char32_t written; on error count = input index of the offending leading byte
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // OR of both words: a set high bit in any of the 16 bytes survives
+            if ((v & 0x8080808080808080) == 0) {
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf32_output++ = char32_t(buf[pos]); // every byte here is < 0x80, so it widens unchanged
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *utf32_output++ = char32_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos); // expected continuation byte is missing
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) {
+                return result(error_code::OVERLONG, pos);
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8
+            if (pos + 2 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if (code_point < 0x800 || 0xffff < code_point) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xd7ff < code_point && code_point < 0xe000) {
+                return result(error_code::SURROGATE, pos); // 0xD800-0xDFFF is reported separately from OVERLONG
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // we have a 4-byte UTF-8 word.
+            if (pos + 3 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+
+            // range check
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0x10ffff < code_point) {
+                return result(error_code::TOO_LARGE, pos); // above the last code point U+10FFFF
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 4;
+        } else {
+            // we either have too many continuation bytes or an invalid leading byte
+            if ((leading_byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            } else {
+                return result(error_code::HEADER_BITS, pos);
+            }
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf32_output - start);
+    return result(error_code::SUCCESS, utf32_output - start);
 }
 
 /**
@@ -11574,41 +13210,44 @@ inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_o
  * If the error is believed to have occured prior to 'buf', the count value contain in the result
  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
  */
-inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  size_t how_far_back = 3; // 3 bytes in the past + current position
-  if(how_far_back > prior_bytes) { how_far_back = prior_bytes; }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-i];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if(found_leading_bytes) {
-      buf -= i;
-      extra_len = i;
-      break;
-    }
-  }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
-  // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if(!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is continuation]
-    // Or we possibly have a stream that does not start with a leading byte.
-    return result(error_code::TOO_LONG, -how_far_back);
-  }
-
-  result res = convert_with_errors(buf, len + extra_len, utf32_output);
-  if (res.error) {
-    res.count -= extra_len;
-  }
-  return res;
+inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) // backs buf up to the nearest leading byte (at most 3 bytes, never more than prior_bytes) and then runs convert_with_errors from there
+{
+    size_t extra_len { 0 };
+    // We potentially need to go back in time and find a leading byte.
+    size_t how_far_back = 3; // 3 bytes in the past + current position
+    if (how_far_back > prior_bytes) {
+        how_far_back = prior_bytes;
+    }
+    bool found_leading_bytes { false };
+    // important: it is i <= how_far_back and not 'i < how_far_back'.
+    for (size_t i = 0; i <= how_far_back; i++) {
+        unsigned char byte = buf[0 - i]; // i is unsigned, so 0 - i wraps; NOTE(review): presumably meant as "i bytes before buf" - confirm this is equivalent to buf[-i] on all targets
+        found_leading_bytes = ((byte & 0b11000000) != 0b10000000); // anything that is not 10xxxxxx starts a character
+        if (found_leading_bytes) {
+            buf -= i;
+            extra_len = i;
+            break;
+        }
+    }
+    //
+    // It is possible for this function to return a negative count in its result.
+    // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described in the C Standard as <stddef.h>.
+    // C Standard Section 4.1.5 defines size_t as the unsigned integral type of the result of the sizeof operator.
+    //
+    // An unsigned type will simply wrap round arithmetically (well defined).
+    //
+    if (!found_leading_bytes) {
+        // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+        // [....] [continuation] [continuation] [continuation] | [buf is continuation]
+        // Or we possibly have a stream that does not start with a leading byte.
+        return result(error_code::TOO_LONG, 0 - how_far_back); // wrapped "negative" offset relative to the original buf
+    }
+
+    result res = convert_with_errors(buf, len + extra_len, utf32_output);
+    if (res.error) {
+        res.count -= extra_len; // re-express the error position relative to the caller's buf; may wrap below zero
+    }
+    return res;
+}
 
 } // utf8_to_utf32 namespace
@@ -11618,17 +13257,596 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
 
 #endif
 /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
-//
 
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1_to_utf8/latin1_to_utf8.h
+/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF8_H
+#define SIMDUTF_LATIN1_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf8 {
+
+inline size_t convert(const char* buf, size_t len, char* utf8_output) // scalar Latin-1 -> UTF-8: writes one or two bytes per input byte and returns the number of bytes written
+{
+    const unsigned char* data = reinterpret_cast<const unsigned char*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // Only the high bit of each byte matters, so OR-ing the two words lets one mask test all 16 bytes
+            if ((v & 0x8080808080808080) == 0) { // if NONE of the high bits are set, every one of the 16 bytes is ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf8_output++ = char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+
+        unsigned char byte = data[pos];
+        if ((byte & 0x80) == 0) { // if ASCII
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(byte);
+            pos++;
+        } else {
+            // will generate two UTF-8 bytes
+            *utf8_output++ = char((byte >> 6) | 0b11000000); // leading byte 110000xx carries the top two bits
+            *utf8_output++ = char((byte & 0b111111) | 0b10000000); // continuation byte 10xxxxxx carries the low six bits
+            pos++;
+        }
+    }
+    return utf8_output - start;
+}
+
+} // latin1_to_utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1_to_utf16/latin1_to_utf16.h
+/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF16_H
+#define SIMDUTF_LATIN1_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf16 {
+
+template<endianness big_endian> // big_endian: requested byte order of the UTF-16 output
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) // scalar Latin-1 -> UTF-16: one char16_t per input byte; returns the number of char16_t written
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+
+    while (pos < len) {
+        uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
+        *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); // byte-swap only when the requested order differs from the host's
+        pos++;
+    }
+
+    return utf16_output - start;
+}
+
+template<endianness big_endian> // big_endian: requested byte order of the UTF-16 output
+inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) // scalar Latin-1 -> UTF-16; has no failing path, so the result is always SUCCESS with count = char16_t written
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+
+    while (pos < len) {
+        uint16_t word = uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
+        *utf16_output++ = char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); // byte-swap only when the requested order differs from the host's
+        pos++;
+    }
+
+    return result(error_code::SUCCESS, utf16_output - start);
+}
+
+} // latin1_to_utf16 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1_to_utf32/latin1_to_utf32.h
+/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF32_H
+#define SIMDUTF_LATIN1_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf32 {
+
+inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) // scalar Latin-1 -> UTF-32: one char32_t per input byte; returns the number of char32_t written (== len)
+{
+    const unsigned char* data = reinterpret_cast<const unsigned char*>(buf); // read as unsigned so bytes >= 0x80 widen to 0x80-0xFF rather than sign-extending
+    char32_t* start { utf32_output };
+    for (size_t i = 0; i < len; i++) {
+        *utf32_output++ = (char32_t)data[i];
+    }
+    return utf32_output - start;
+}
+
+inline result convert_with_errors(const char32_t* buf, size_t len, char32_t* utf32_output) // NOTE(review): input is typed char32_t here although this is the latin1_to_utf32 namespace - looks like it should be const char*; confirm before relying on it
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf); // reads len 32-bit units, not len bytes
+    char32_t* start { utf32_output };
+    for (size_t i = 0; i < len; i++) {
+        *utf32_output++ = (char32_t)data[i]; // copied through unchanged
+    }
+    return result(error_code::SUCCESS, utf32_output - start); // no failing path: always SUCCESS with count == len
+}
+
+} // latin1_to_utf32 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_latin1/utf8_to_latin1.h
+/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
+#ifndef SIMDUTF_UTF8_TO_LATIN1_H
+#define SIMDUTF_UTF8_TO_LATIN1_H
+#include <iostream>
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert(const char* buf, size_t len, char* latin_output) // validating scalar UTF-8 -> Latin-1: returns the number of bytes written, or 0 if the input is invalid UTF-8 or holds a code point above 0xFF
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char* start { latin_output };
+
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // We are only interested in these bits: 1000 1000 1000 1000 .... etc
+            if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, i.e. all of them are zero, then everything is ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+
+        // suppose it is not an all ASCII byte sequence
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *latin_output++ = char(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0;
+            } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
+            // range check -
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
+            if (code_point < 0x80 || 0xFF < code_point) {
+                return 0; // reject overlong forms (< 0x80, which must be a single byte) and anything above Latin-1; only 0x80-0xFF is valid here
+            }
+            *latin_output++ = char(code_point);
+            pos += 2;
+        } else {
+            return 0; // 3- and 4-byte sequences cannot fit in Latin-1; continuation or invalid leading bytes are malformed
+        }
+    }
+    return latin_output - start;
+}
+
+inline result convert_with_errors(const char* buf, size_t len, char* latin_output) // validating scalar UTF-8 -> Latin-1: on success count = bytes written; on error count = input index of the offending leading byte
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char* start { latin_output };
+
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // We are only interested in these bits: 1000 1000 1000 1000...etc
+            if ((v & 0x8080808080808080) == 0) { // if NONE of these are set, i.e. all of them are zero, then everything is ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        // suppose it is not an all ASCII byte sequence
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte !!!
+            *latin_output++ = char(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits indicate:
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            } // checks if the next byte is a valid continuation byte in UTF-8. A valid continuation byte starts with 10.
+            // range check -
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // assembles the Unicode code point from the two bytes. It does this by discarding the leading 110 and 10 bits from the two bytes, shifting the remaining bits of the first byte, and then combining the results with a bitwise OR operation.
+            if (code_point < 0x80) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xFF < code_point) {
+                return result(error_code::TOO_LARGE, pos);
+            } // Only the range 128-255 (0x80-0xFF), the non-ASCII Latin-1 characters, is accepted from a two-byte sequence
+            *latin_output++ = char(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8
+            return result(error_code::TOO_LARGE, pos); // reported without validating the continuation bytes
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // we have a 4-byte UTF-8 word.
+            return result(error_code::TOO_LARGE, pos); // reported without validating the continuation bytes
+        } else {
+            // we either have too many continuation bytes or an invalid leading byte
+            if ((leading_byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            }
+
+            return result(error_code::HEADER_BITS, pos);
+        }
+    }
+    return result(error_code::SUCCESS, latin_output - start);
+}
+
+} // utf8_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_latin1/utf16_to_latin1.h
+/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
+#ifndef SIMDUTF_UTF16_TO_LATIN1_H
+#define SIMDUTF_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+#include <cstring> // for std::memcpy
+
+template<endianness big_endian> // big_endian: byte order of the UTF-16 input
+inline size_t convert(const char16_t* buf, size_t len, char* latin_output) // scalar UTF-16 -> Latin-1: returns len on success, or 0 (leaving latin_output untouched) if any code unit is above 0xFF
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    std::vector<char> temp_output(len); // heap-allocated staging buffer, one byte per input code unit
+    char* current_write = temp_output.data();
+    uint16_t word = 0;
+    uint16_t too_large = 0; // running OR of every code unit; checked once after the loop
+
+    while (pos < len) {
+        word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        too_large |= word;
+        *current_write++ = char(word & 0xFF);
+        pos++;
+    }
+    if ((too_large & 0xFF00) != 0) {
+        return 0; // at least one code unit had a non-zero high byte
+    }
+
+    // Only copy to latin_output if there were no errors
+    std::memcpy(latin_output, temp_output.data(), len);
+
+    return current_write - temp_output.data();
+}
+
+template<endianness big_endian> // big_endian: byte order of the UTF-16 input
+inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output) // scalar UTF-16 -> Latin-1: on success count = bytes written; on the first code unit above 0xFF returns TOO_LARGE with count = its index
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { latin_output };
+    uint16_t word;
+
+    while (pos < len) {
+        if (pos + 16 <= len) { // if it is safe to read 32 more bytes (16 code units), check that they are Latin1
+            uint64_t v1, v2, v3, v4; // four code units per 64-bit word
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
+            ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
+            ::memcpy(&v4, data + pos + 12, sizeof(uint64_t));
+
+            if (!match_system(big_endian)) { // input byte order differs from the host: rotate by 8 bits so each unit's real high byte lands under the 0xFF00 mask
+                v1 = (v1 >> 8) | (v1 << (64 - 8));
+            }
+            if (!match_system(big_endian)) {
+                v2 = (v2 >> 8) | (v2 << (64 - 8));
+            }
+            if (!match_system(big_endian)) {
+                v3 = (v3 >> 8) | (v3 << (64 - 8));
+            }
+            if (!match_system(big_endian)) {
+                v4 = (v4 >> 8) | (v4 << (64 - 8)); // must rotate v4 itself; using v1 here left three of the last four code units unchecked
+            }
+
+            if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { // no high byte set in any of the 16 units: all fit in Latin-1
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(data[pos])) : char(data[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; // slow path: one code unit at a time so the failing index is exact
+        if ((word & 0xFF00) == 0) {
+            *latin_output++ = char(word & 0xFF);
+            pos++;
+        } else {
+            return result(error_code::TOO_LARGE, pos);
+        }
+    }
+    return result(error_code::SUCCESS, latin_output - start);
+}
+
+} // utf16_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_latin1/utf32_to_latin1.h
+/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
+#ifndef SIMDUTF_UTF32_TO_LATIN1_H
+#define SIMDUTF_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+// Writes the low byte of each of the `len` UTF-32 code points in `buf` to
+// `latin1_output`. Returns the number of bytes written, or 0 if any code point is
+// above 0xFF (in that case `latin1_output` has still been overwritten).
+inline size_t convert(const char32_t* buf, size_t len, char* latin1_output)
+{
+    const uint32_t* code_points = reinterpret_cast<const uint32_t*>(buf);
+    uint32_t seen_bits = 0; // OR of all inputs; any bit above the low 8 means failure
+
+    for (size_t i = 0; i < len; i++) {
+        const uint32_t cp = code_points[i];
+        seen_bits |= cp;
+        latin1_output[i] = static_cast<char>(cp & 0xFF);
+    }
+
+    if (seen_bits & 0xFFFFFF00) {
+        return 0;
+    }
+    return len;
+}
+
+// Converts `len` UTF-32 code points to Latin-1, stopping at the first value above 0xFF.
+// Returns result(SUCCESS, bytes written) or result(TOO_LARGE, index of the offender);
+// output for the code points before the offender has already been written.
+inline result convert_with_errors(const char32_t* buf, size_t len, char* latin1_output)
+{
+    const uint32_t* code_points = reinterpret_cast<const uint32_t*>(buf);
+    size_t i = 0; // one output byte per input code point, so also the output index
+    while (i < len) {
+        if (i + 2 <= len) { // two code points remain: test both with a single 64-bit load
+            uint64_t pair;
+            ::memcpy(&pair, code_points + i, sizeof(uint64_t));
+            if ((pair & 0xFFFFFF00FFFFFF00) == 0) {
+                latin1_output[i] = char(buf[i]);
+                latin1_output[i + 1] = char(buf[i + 1]);
+                i += 2;
+                continue;
+            }
+        }
+        const uint32_t cp = code_points[i];
+        if ((cp & 0xFFFFFF00) != 0) {
+            return result(error_code::TOO_LARGE, i);
+        }
+        latin1_output[i++] = static_cast<char>(cp & 0xFF);
+    }
+    return result(error_code::SUCCESS, i);
+}
+
+} // utf32_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
+
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_latin1/valid_utf8_to_latin1.h
+/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+// Converts UTF-8 that is assumed valid and Latin-1-representable into Latin-1 bytes.
+// Returns the number of bytes written to `latin_output`. Returns 0 when a lead byte is
+// not ASCII or a two-byte lead, or when a two-byte lead is not followed by a
+// continuation byte. A two-byte lead that is the very last input byte is dropped.
+inline size_t convert_valid(const char* buf, size_t len, char* latin_output)
+{
+    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(buf);
+    char* out = latin_output;
+    size_t i = 0;
+
+    while (i < len) {
+        // Fast path: when 16 bytes remain and none has its top bit set, copy them as-is.
+        if (i + 16 <= len) {
+            uint64_t lo, hi;
+            ::memcpy(&lo, bytes + i, sizeof(uint64_t));
+            ::memcpy(&hi, bytes + i + sizeof(uint64_t), sizeof(uint64_t));
+            if (((lo | hi) & 0x8080808080808080) == 0) {
+                for (const size_t stop = i + 16; i < stop; i++) {
+                    *out++ = char(buf[i]);
+                }
+                continue;
+            }
+        }
+
+        const uint8_t lead = bytes[i];
+        if (lead < 0x80) {
+            // Single ASCII byte.
+            *out++ = char(lead);
+            i++;
+            continue;
+        }
+
+        if ((lead & 0xE0) != 0xC0) {
+            // Continuation byte or a longer lead: no error checking is done here.
+            return 0;
+        }
+        if (i + 1 >= len) {
+            break; // truncated two-byte sequence at the end of the input
+        }
+        const uint8_t trail = bytes[i + 1];
+        if ((trail & 0xC0) != 0x80) {
+            return 0; // second byte is not of the form 10xxxxxx
+        }
+        // Drop the 110/10 tag bits and join the payloads: 5 bits from the lead, 6 from the trail.
+        const uint32_t code_point = (lead & 0x1F) << 6 | (trail & 0x3F);
+        *out++ = char(code_point);
+        i += 2;
+    }
+    return out - latin_output;
+}
+
+} // utf8_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_latin1/valid_utf16_to_latin1.h
+/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+// Narrows `len` UTF-16 code units to one byte each without any range check: every
+// (possibly byte-swapped) unit is simply cast to char. Always returns `len`.
+template<endianness big_endian>
+inline size_t convert_valid(const char16_t* buf, size_t len, char* latin_output)
+{
+    const uint16_t* units = reinterpret_cast<const uint16_t*>(buf);
+
+    for (size_t i = 0; i < len; i++) {
+        uint16_t unit = units[i];
+        if (!match_system(big_endian)) {
+            unit = utf16::swap_bytes(unit);
+        }
+        latin_output[i] = char(unit);
+    }
+    return len;
+}
+
+} // utf16_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_latin1/valid_utf32_to_latin1.h
+/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+// Narrows `len` UTF-32 code points (assumed to be <= 0xFF) to Latin-1 bytes. No error is
+// reported: a larger value is reduced to its low byte. Always returns `len`.
+inline size_t convert_valid(const char32_t* buf, size_t len, char* latin1_output)
+{
+    const uint32_t* code_points = reinterpret_cast<const uint32_t*>(buf);
+    size_t i = 0; // input index; equals the output index (one byte per code point)
+
+    while (i < len) {
+        if (i + 2 <= len) {
+            // Pairwise path: taken when both code points have only low-byte bits set.
+            uint64_t pair;
+            ::memcpy(&pair, code_points + i, sizeof(uint64_t));
+            if ((pair & 0xFFFFFF00FFFFFF00) == 0) {
+                latin1_output[i] = char(buf[i]);
+                latin1_output[i + 1] = char(buf[i + 1]);
+                i += 2;
+                continue;
+            }
+        }
+        // Single code point (the last one, or a pair that failed the mask test).
+        latin1_output[i] = static_cast<char>(code_points[i] & 0xFF);
+        i++;
+    }
+    return len;
+}
+
+} // utf32_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
 
 SIMDUTF_PUSH_DISABLE_WARNINGS
 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 
-
 #if SIMDUTF_IMPLEMENTATION_ARM64
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/implementation.cpp
 /* begin file src/arm64/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/begin.h
 /* begin file src/simdutf/arm64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
 // #define SIMDUTF_IMPLEMENTATION arm64
@@ -11641,14 +13859,16 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
     simd8<uint8_t> bits = input.reduce_or();
     return bits.max_val() < 0b10000000u;
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
     simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
-    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
+    simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
     // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
     // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
@@ -11658,17 +13878,19 @@ simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd
     return is_second_byte ^ is_third_byte ^ is_fourth_byte;
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
     return is_third_byte ^ is_fourth_byte;
 }
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_detect_encodings.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_detect_encodings.cpp
 /* begin file src/arm64/arm_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int arm_detect_encodings(const char * buf, size_t len) {
+int arm_detect_encodings(const char* buf, size_t len)
+{
     const char* start = buf;
     const char* end = buf + len;
 
@@ -11683,13 +13905,13 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
     uint32x4_t currentmax = vmovq_n_u32(0x0);
 
-    checker check{};
+    checker check {};
 
-    while(buf + 64 <= end) {
+    while (buf + 64 <= end) {
         uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
+        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2 * simd16<uint16_t>::SIZE / sizeof(char16_t));
+        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3 * simd16<uint16_t>::SIZE / sizeof(char16_t));
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(secondin);
@@ -11721,15 +13943,15 @@ int arm_detect_encodings(const char * buf, size_t len) {
                 is_utf32 = false;
                 // Code from arm_validate_utf16le.cpp
                 // Not efficient, we do not process surrogates_wordmask1
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
+                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint64_t V0 = ~surrogates_wordmask0;
 
-                const auto vH0 = ((in16 & v_fc) ==  v_dc);
+                const auto vH0 = ((in16 & v_fc) == v_dc);
                 const uint64_t H0 = vH0.to_bitmask64();
 
                 const uint64_t L0 = ~H0 & surrogates_wordmask0;
@@ -11756,12 +13978,12 @@ int arm_detect_encodings(const char * buf, size_t len) {
                     const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
 
                     const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
-                    if(surrogates_wordmask == 0) {
+                    if (surrogates_wordmask == 0) {
                         input += 16;
                     } else {
                         const uint64_t V = ~surrogates_wordmask;
 
-                        const auto vH = ((in_16 & v_fc) ==  v_dc);
+                        const auto vH = ((in_16 & v_fc) == v_dc);
                         const uint64_t H = vH.to_bitmask64();
 
                         const uint64_t L = ~H & surrogates_wordmask;
@@ -11785,23 +14007,23 @@ int arm_detect_encodings(const char * buf, size_t len) {
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
+                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
 
                     // Must start checking for surrogates
                     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
                     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
                     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
 
-                    const uint32x4_t in32 =  vreinterpretq_u32_u16(in);
-                    const uint32x4_t secondin32 =  vreinterpretq_u32_u16(secondin);
-                    const uint32x4_t thirdin32 =  vreinterpretq_u32_u16(thirdin);
-                    const uint32x4_t fourthin32 =  vreinterpretq_u32_u16(fourthin);
+                    const uint32x4_t in32 = vreinterpretq_u32_u16(in);
+                    const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin);
+                    const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin);
+                    const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin);
 
-                    currentmax = vmaxq_u32(in32,currentmax);
-                    currentmax = vmaxq_u32(secondin32,currentmax);
-                    currentmax = vmaxq_u32(thirdin32,currentmax);
-                    currentmax = vmaxq_u32(fourthin32,currentmax);
+                    currentmax = vmaxq_u32(in32, currentmax);
+                    currentmax = vmaxq_u32(secondin32, currentmax);
+                    currentmax = vmaxq_u32(thirdin32, currentmax);
+                    currentmax = vmaxq_u32(fourthin32, currentmax);
 
                     currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
                     currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
@@ -11810,13 +14032,13 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
                     while (input + 4 < end32) {
                         const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-                        currentmax = vmaxq_u32(in_32,currentmax);
+                        currentmax = vmaxq_u32(in_32, currentmax);
                         currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
                         input += 4;
                     }
 
                     uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(vmaxvq_u32(forbidden_words) != 0) {
+                    if (vmaxvq_u32(forbidden_words) != 0) {
                         is_utf32 = false;
                     }
                 } else {
@@ -11828,10 +14050,10 @@ int arm_detect_encodings(const char * buf, size_t len) {
         // If no surrogate, validate under other encodings as well
 
         // UTF-32 validation
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin), currentmax);
 
         // UTF-8 validation
         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
@@ -11845,7 +14067,7 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
+            uint8_t block[64] {};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -11856,14 +14078,14 @@ int arm_detect_encodings(const char * buf, size_t len) {
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
+        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -11872,10 +14094,11 @@ int arm_detect_encodings(const char * buf, size_t len) {
 }
 /* end file src/arm64/arm_detect_encodings.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_validate_utf16.cpp
 /* begin file src/arm64/arm_validate_utf16.cpp */
-template <endianness big_endian>
-const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
+{
     const char16_t* end = input + size;
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
@@ -11888,11 +14111,11 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
         auto in0 = simd16<uint16_t>(input);
         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
         if (!match_system(big_endian)) {
-            #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            #else
-            const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-            #endif
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
         }
@@ -11901,7 +14124,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-        if(surrogates_wordmask == 0) {
+        if (surrogates_wordmask == 0) {
             input += 16;
         } else {
             // 2. We have some surrogates that have to be distinguished:
@@ -11915,7 +14138,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
             const uint64_t V = ~surrogates_wordmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = ((in & v_fc) ==  v_dc);
+            const auto vH = ((in & v_fc) == v_dc);
             const uint64_t H = vH.to_bitmask64();
 
             // L - word mask for low surrogates
@@ -11923,11 +14146,11 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
             const uint64_t L = ~H & surrogates_wordmask;
 
             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
-                              // (A low surrogate placed in the 7th register's word
-                              // is an exception we handle.)
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
-                          // thanks to that we have only two masks for valid case.
-            const uint64_t c = V | a | b;      // Combine all the masks into the final one.
+                                       // thanks to that we have only two masks for valid case.
+            const uint64_t c = V | a | b; // Combine all the masks into the final one.
             if (c == ~0ull) {
                 // The whole input register contains valid UTF-16, i.e.,
                 // either single words or proper surrogate pairs.
@@ -11946,9 +14169,9 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
     return input;
 }
 
-
-template <endianness big_endian>
-const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
+{
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -11964,11 +14187,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
 
         if (!match_system(big_endian)) {
-            #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            #else
-            const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-            #endif
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
         }
@@ -11977,7 +14200,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-        if(surrogates_wordmask == 0) {
+        if (surrogates_wordmask == 0) {
             input += 16;
         } else {
             // 2. We have some surrogates that have to be distinguished:
@@ -11991,7 +14214,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint64_t V = ~surrogates_wordmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = ((in & v_fc) ==  v_dc);
+            const auto vH = ((in & v_fc) == v_dc);
             const uint64_t H = vH.to_bitmask64();
 
             // L - word mask for low surrogates
@@ -11999,11 +14222,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint64_t L = ~H & surrogates_wordmask;
 
             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
-                              // (A low surrogate placed in the 7th register's word
-                              // is an exception we handle.)
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
-                          // thanks to that we have only two masks for valid case.
-            const uint64_t c = V | a | b;      // Combine all the masks into the final one.
+                                       // thanks to that we have only two masks for valid case.
+            const uint64_t c = V | a | b; // Combine all the masks into the final one.
             if (c == ~0ull) {
                 // The whole input register contains valid UTF-16, i.e.,
                 // either single words or proper surrogate pairs.
@@ -12022,10 +14245,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
     return result(error_code::SUCCESS, input - start);
 }
 /* end file src/arm64/arm_validate_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
 /* begin file src/arm64/arm_validate_utf32le.cpp */
 
-const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
+const char32_t* arm_validate_utf32le(const char32_t* input, size_t size)
+{
     const char32_t* end = input + size;
 
     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
@@ -12036,26 +14260,26 @@ const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
 
     while (input + 4 < end) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-        currentmax = vmaxq_u32(in,currentmax);
+        currentmax = vmaxq_u32(in, currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
         input += 4;
     }
 
     uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-    if(vmaxvq_u32(is_zero) != 0) {
+    if (vmaxvq_u32(is_zero) != 0) {
         return nullptr;
     }
 
     is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if(vmaxvq_u32(is_zero) != 0) {
+    if (vmaxvq_u32(is_zero) != 0) {
         return nullptr;
     }
 
     return input;
 }
 
-
-const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size)
+{
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -12067,16 +14291,16 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
 
     while (input + 4 < end) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-        currentmax = vmaxq_u32(in,currentmax);
+        currentmax = vmaxq_u32(in, currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
 
         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if(vmaxvq_u32(is_zero) != 0) {
+        if (vmaxvq_u32(is_zero) != 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if(vmaxvq_u32(is_zero) != 0) {
+        if (vmaxvq_u32(is_zero) != 0) {
             return result(error_code::SURROGATE, input - start);
         }
 
@@ -12087,316 +14311,308 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
 }
 /* end file src/arm64/arm_validate_utf32le.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
 /* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  #else
-  const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-  #endif
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-    // We process in chunks of 16 bytes
-    uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in));
-    uint16x8_t ascii_second = vmovl_high_u8(in);
-    if (!match_system(big_endian)) {
-      ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
-      ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
-    }
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    uint8x16_t perm = vqtbl1q_u8(in, swap);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
-    vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
+template<endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char16_t*& utf16_output)
+{
+// we use an approach where we try to process up to 12 input bytes.
+// Why 12 input bytes and not 16? Because we are concerned with the size of
+// the lookup tables. Also 12 is nicely divisible by two and three.
+//
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+    const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-    const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
+    const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
-    if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
-    utf16_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-
-
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
-    vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
-    if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
-    utf16_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
-    uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
-    // correct for spurious high bit
-    uint8x16_t correct =
-        vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
-    middlehighbyte = veorq_u8(correct, middlehighbyte);
-    uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
-    uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
-    uint8x16_t composed =
-        vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
-                     vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
-    uint32x4_t composedminus =
-        vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
-    uint32x4_t lowtenbits =
-        vandq_u32(composedminus, vmovq_n_u32(0x3ff));
-    uint32x4_t hightenbits = vshrq_n_u32(composedminus, 10);
-    uint32x4_t lowtenbitsadd =
-        vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
-    uint32x4_t hightenbitsadd =
-        vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
-    uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
-    uint32x4_t surrogates =
-        vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (!match_system(big_endian)) {
-      vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
-      surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
-    }
-    vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
-    uint32_t surrogate_buffer[4];
-    vst1q_u32(surrogate_buffer, surrogates);
-    for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] < 65536) {
-        utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-        utf16_output++;
-      } else {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-        utf16_output += 2;
-      }
-    }
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+    uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+        // We process in chunks of 16 bytes
+        uint16x8_t ascii_first = vmovl_u8(vget_low_u8(in));
+        uint16x8_t ascii_second = vmovl_high_u8(in);
+        if (!match_system(big_endian)) {
+            ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
+            ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
+        utf16_output += 16; // We wrote 16 16-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        uint8x16_t perm = vqtbl1q_u8(in, swap);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        if (!match_system(big_endian)) {
+            composed = vqtbl1q_u8(composed, swap);
+        }
+        vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
+        utf16_output += 8; // We wrote 16 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+#else
+        const uint8x16_t sh = { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 };
+#endif
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
+        if (!match_system(big_endian)) {
+            composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
+        utf16_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fallback.
+
+    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes.
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        if (!match_system(big_endian)) {
+            composed = vqtbl1q_u8(composed, swap);
+        }
+        vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
+        utf16_output += 6; // We wrote 12 bytes, 6 code points.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
+        if (!match_system(big_endian)) {
+            composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
+        utf16_output += 4;
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        //////////////
+        // There might be garbage inputs where a leading byte masquerades as a four-byte
+        // leading byte (by being followed by 3 continuation bytes), but is less than
+        // 0xf0. This could trigger a buffer overflow if we only counted leading
+        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+        // We do so at the cost of an extra mask.
+        /////////////
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
+        // correct for spurious high bit
+        uint8x16_t correct = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
+        middlehighbyte = veorq_u8(correct, middlehighbyte);
+        uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
+        // We deliberately carry the leading four bits if they are present, we remove
+        // them later when computing hightenbits.
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0xff000000)));
+        uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
+        // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+        // the corresponding 32-bit value in 'composed' will be greater than
+        // (0xf0000000 >> 6), i.e. greater than 0x3c00000. This can be used later to
+        // identify the location of the surrogate pairs.
+        uint8x16_t composed = vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
+            vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
+        uint32x4_t composedminus = vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
+        uint32x4_t lowtenbits = vandq_u32(composedminus, vmovq_n_u32(0x3ff));
+        // Notice the 0x3ff mask:
+        uint32x4_t hightenbits = vandq_u32(vshrq_n_u32(composedminus, 10), vmovq_n_u32(0x3ff));
+        uint32x4_t lowtenbitsadd = vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
+        uint32x4_t hightenbitsadd = vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
+        uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
+        uint32x4_t surrogates = vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
+        uint32_t basic_buffer[4];
+        uint32_t basic_buffer_swap[4];
+        if (!match_system(big_endian)) {
+            vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
+            surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
+        }
+        vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
+        uint32_t surrogate_buffer[4];
+        vst1q_u32(surrogate_buffer, surrogates);
+        for (size_t i = 0; i < 3; i++) {
+            if (basic_buffer[i] > 0x3c00000) {
+                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+                utf16_output += 2;
+            } else {
+                utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+                utf16_output++;
+            }
+        }
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
 /* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char32_t *&utf32_out) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xFFF;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-    // We process in chunks of 16 bytes
-    vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (in)))));
-    vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8 (in))));
-    vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
-    vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
-    utf32_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
+size_t convert_masked_utf8_to_utf32(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char32_t*& utf32_out)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
+    uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xFFF;
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+        // We process in chunks of 16 bytes
+        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(in)))));
+        vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8(in))));
+        vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
+        vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
+        utf32_output += 16; // We wrote 16 32-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-    //const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-    const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        // const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        const uint8x16_t sh = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
-    vst1q_u32(utf32_output+4,  vmovl_high_u16(vreinterpretq_u16_u8(composed)));
-    utf32_output += 8; // We wrote 32 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
+        vst1q_u32(utf32_output + 4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
+        utf32_output += 8; // We wrote 32 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+        const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
 #else
-    const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
+        const uint8x16_t sh = { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 };
 #endif
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    vst1q_u32(utf32_output, composed);
-    utf32_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-
-
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
-    vst1q_u32(utf32_output+4,  vmovl_high_u16(vreinterpretq_u16_u8(composed)));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    vst1q_u32(utf32_output, composed);
-    utf32_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
-    uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
-    // correct for spurious high bit
-    uint8x16_t correct =
-        vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
-    middlehighbyte = veorq_u8(correct, middlehighbyte);
-    uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
-    uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
-    uint8x16_t composed =
-        vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
-                     vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
-    vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
-    utf32_output += 3;
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        vst1q_u32(utf32_output, composed);
+        utf32_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back.
+
+    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes.
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
+        vst1q_u32(utf32_output + 4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
+        utf32_output += 6; // We stored 8 lanes (32 bytes), of which 6 are code points.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        vst1q_u32(utf32_output, composed);
+        utf32_output += 4;
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
+        // correct for spurious high bit
+        uint8x16_t correct = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
+        middlehighbyte = veorq_u8(correct, middlehighbyte);
+        uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
+        uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
+        uint8x16_t composed = vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
+            vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
+        vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
+        utf32_output += 3;
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
 /* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -12450,533 +14666,540 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-        // It is common enough that we have sequences of 16 consecutive ASCII characters.
-        uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+template<endianness big_endian>
+std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    while (buf + 16 <= end) {
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         if (!match_system(big_endian)) {
-          #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          #else
-          const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-          #endif
-          nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
-        }
-        if(vmaxvq_u16(nextin) > 0x7F) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(in);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-          // 2. store (16 bytes)
-          vst1q_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    if (vmaxvq_u16(in) <= 0x7FF) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-          const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const uint16x8_t t0 = vshlq_n_u16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const uint16x8_t t2 = vandq_u16(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const uint16x8_t t3 = vorrq_u16(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-          // 2. merge ASCII and 2-byte codewords
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-          const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-          // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080);
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-          const uint16x8_t mask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080 };
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-          uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const uint8x16_t shuffle = vld1q_u8(row + 1);
-          const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          vst1q_u8(utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+        if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+            // It is common enough that we have sequences of 16 consecutive ASCII characters.
+            uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 8);
+            if (!match_system(big_endian)) {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+                const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+                const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-        const uint16x8_t s0 = vshrq_n_u16(in, 12);
-        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-        const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000));
-        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-        // [00bb|bbbb|0000|aaaa]
-        const uint16x8_t s2 = vorrq_u16(s0, s1s);
-        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-        const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-        const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-        const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+                nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
+            }
+            if (vmaxvq_u16(nextin) > 0x7F) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(in);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+                // 2. store (16 bytes)
+                vst1q_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        if (vmaxvq_u16(in) <= 0x7FF) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const uint16x8_t t0 = vshlq_n_u16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const uint16x8_t t2 = vandq_u16(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const uint16x8_t t3 = vorrq_u16(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+            // 2. merge ASCII and 2-byte codewords
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+            // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 );
-        const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 );
+            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080);
 #else
-        const uint16x8_t onemask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 };
-        const uint16x8_t twomask = { 0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 };
+            const uint16x8_t mask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080 };
 #endif
-        const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-        const uint16_t mask = vaddvq_u16(combined);
-        // The following fast path may or may not be beneficial.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += 12;
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += row0[0];
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const uint8x16_t shuffle = vld1q_u8(row + 1);
+            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            vst1q_u8(utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+            const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+#endif
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+            // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+            const uint16x8_t s0 = vshrq_n_u16(in, 12);
+            // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+            const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+            // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+            const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+            // [00bb|bbbb|0000|aaaa]
+            const uint16x8_t s2 = vorrq_u16(s0, s1s);
+            // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+            const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+            const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+            const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+            const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+            const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000);
+            const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000);
+#else
+            const uint16x8_t onemask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000 };
+            const uint16x8_t twomask = { 0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000 };
+#endif
+            const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+            const uint16_t mask = vaddvq_u16(combined);
+            // The following fast path may or may not be beneficial.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+              const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+              const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+              vst1q_u8(utf8_output, utf8_0);
+              utf8_output += 12;
+              vst1q_u8(utf8_output, utf8_1);
+              utf8_output += 12;
+              buf += 8;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+            vst1q_u8(utf8_output, utf8_0);
+            utf8_output += row0[0];
+            vst1q_u8(utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+    return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+template<endianness big_endian>
+std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
     const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-        // It is common enough that we have sequences of 16 consecutive ASCII characters.
-        uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    while (buf + 16 <= end) {
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         if (!match_system(big_endian)) {
-          #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          #else
-          const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-          #endif
-          nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
-        }
-        if(vmaxvq_u16(nextin) > 0x7F) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(in);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-          // 2. store (16 bytes)
-          vst1q_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    if (vmaxvq_u16(in) <= 0x7FF) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-          const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const uint16x8_t t0 = vshlq_n_u16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const uint16x8_t t2 = vandq_u16(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const uint16x8_t t3 = vorrq_u16(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-          // 2. merge ASCII and 2-byte codewords
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-          const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-          // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080);
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+        if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+            // It is common enough that we have sequences of 16 consecutive ASCII characters.
+            uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 8);
+            if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+                const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+                nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
+            }
+            if (vmaxvq_u16(nextin) > 0x7F) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(in);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+                // 2. store (16 bytes)
+                vst1q_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        if (vmaxvq_u16(in) <= 0x7FF) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const uint16x8_t t0 = vshlq_n_u16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const uint16x8_t t2 = vandq_u16(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const uint16x8_t t3 = vorrq_u16(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+            // 2. merge ASCII and 2-byte codewords
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+            // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080);
 #else
-          const uint16x8_t mask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080 };
+            const uint16x8_t mask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080 };
 #endif
-          uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const uint8x16_t shuffle = vld1q_u8(row + 1);
-          const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          vst1q_u8(utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const uint8x16_t shuffle = vld1q_u8(row + 1);
+            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            vst1q_u8(utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+            const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 #else
-        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+            const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
 #endif
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-        const uint16x8_t s0 = vshrq_n_u16(in, 12);
-        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-        const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000));
-        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-        // [00bb|bbbb|0000|aaaa]
-        const uint16x8_t s2 = vorrq_u16(s0, s1s);
-        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-        const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-        const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-        const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+            // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+            const uint16x8_t s0 = vshrq_n_u16(in, 12);
+            // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+            const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+            // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+            const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+            // [00bb|bbbb|0000|aaaa]
+            const uint16x8_t s2 = vorrq_u16(s0, s1s);
+            // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+            const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+            const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+            const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+            const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+            const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 );
-        const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 );
+            const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000);
+            const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000);
 #else
-        const uint16x8_t onemask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 };
-        const uint16x8_t twomask = { 0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 };
+            const uint16x8_t onemask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000 };
+            const uint16x8_t twomask = { 0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000 };
 #endif
-        const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-        const uint16_t mask = vaddvq_u16(combined);
-        // The following fast path may or may not be beneficial.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += 12;
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += row0[0];
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+            const uint16_t mask = vaddvq_u16(combined);
+            // The following fast path may or may not be beneficial.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+              const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+              const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+              vst1q_u8(utf8_output, utf8_0);
+              utf8_output += 12;
+              vst1q_u8(utf8_output, utf8_1);
+              utf8_output += 12;
+              buf += 8;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+            vst1q_u8(utf8_output, utf8_0);
+            utf8_output += row0[0];
+            vst1q_u8(utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // otherwise: at least one surrogate code unit is in the register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output));
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
 }
 /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
 /* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -13030,736 +15253,770 @@ std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* b
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: no surrogate pairs, extend all 16-bit words to 32-bit words
-      vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
-      vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
-      utf32_output += 8;
-      buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian>
+std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out)
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+
+    while (buf + 16 <= end) {
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
+        if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: no surrogate pairs, extend all 16-bit words to 32-bit words
+            vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+            vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+            utf32_output += 8;
+            buf += 8;
+            // otherwise: at least one surrogate code unit is in the register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output));
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
+    } // while
+    return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: no surrogate pairs, extend all 16-bit words to 32-bit words
-      vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
-      vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
-      utf32_output += 8;
-      buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian>
+std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out)
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+
+    while (buf + 16 <= end) {
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
+        if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: no surrogate pairs, extend all 16-bit words to 32-bit words
+            vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+            vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+            utf32_output += 8;
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output));
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
 }
 /* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
 /* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-  const char32_t* end = buf + len;
-
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
-
-  while (buf + 16 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-      if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          continue; // we are done for this round!
-      }
-
-      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+    const char32_t* end = buf + len;
+
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
+
+    while (buf + 16 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+        uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t*>(buf + 4));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+            // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+            // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+            uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+            if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                continue; // we are done for this round!
+            }
 
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const uint16x8_t t3 = vorrq_u16(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-            // 2. merge ASCII and 2-byte codewords
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-            // 3. prepare bitmask for 8-bit lookup
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080);
-  #else
-            const uint16x8_t mask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080 };
-  #endif
-            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const uint8x16_t shuffle = vld1q_u8(row + 1);
-            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+            if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+                // 1. prepare 2-byte values
+                // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+                // expected output   : [110a|aaaa|10bb|bbbb] x 8
+                const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+                const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+                // t0 = [000a|aaaa|bbbb|bb00]
+                const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+                // t1 = [000a|aaaa|0000|0000]
+                const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+                // t2 = [0000|0000|00bb|bbbb]
+                const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+                // t3 = [000a|aaaa|00bb|bbbb]
+                const uint16x8_t t3 = vorrq_u16(t1, t2);
+                // t4 = [110a|aaaa|10bb|bbbb]
+                const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+                // 2. merge ASCII and 2-byte codewords
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+                const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+                // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080);
+#else
+                const uint16x8_t mask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080 };
+#endif
+                uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+                // 4. pack the bytes
+                const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+                const uint8x16_t shuffle = vld1q_u8(row + 1);
+                const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
 
-            // 5. store bytes
-            vst1q_u8(utf8_output, utf8_packed);
+                // 5. store bytes
+                vst1q_u8(utf8_output, utf8_packed);
 
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
+                // 6. adjust pointers
+                buf += 8;
+                utf8_output += row[0];
+                continue;
 
-      } else {
-        // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-        forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
-
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-  #else
-          const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-  #endif
-          /* In this branch we handle three cases:
-            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-            We expand the input word (16-bit) into two words (32-bit), thus
-            we have room for four bytes. However, we need five distinct bit
-            layouts. Note that the last byte in cases #2 and #3 is the same.
-
-            We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-            in register t2.
-
-            We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-            either byte 1 for case #2 or byte 2 for case #3. Note that they
-            differ by exactly one bit.
-
-            Finally from these two words we build proper UTF-8 sequence, taking
-            into account the case (i.e, the number of bytes to write).
-          */
-          /**
-           * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-           * t2 => [0ccc|cccc] [10cc|cccc]
-           * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-           */
-  #define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-          // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-          const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
-          // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-          const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-          // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-          const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-          // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-          const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-          // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-          const uint16x8_t s1 = vandq_u16(utf16_packed, vec(0b0000111111000000));
-          // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-          const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-          // [00bb|bbbb|0000|aaaa]
-          const uint16x8_t s2 = vorrq_u16(s0, s1s);
-          // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-          const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-          const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-          const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
-          const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-          const uint16x8_t s4 = veorq_u16(s3, m0);
-  #undef vec
-
-          // 4. expand words 16-bit => 32-bit
-          const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-          const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-          // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 );
-          const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 );
-  #else
-          const uint16x8_t onemask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 };
-          const uint16x8_t twomask = { 0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 };
-  #endif
-          const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-          const uint16_t mask = vaddvq_u16(combined);
-          // The following fast path may or may not be beneficial.
-          /*if(mask == 0) {
-            // We only have three-byte words. Use fast path.
-            const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-            vst1q_u8(utf8_output, utf8_0);
-            utf8_output += 12;
-            vst1q_u8(utf8_output, utf8_1);
-            utf8_output += 12;
-            buf += 8;
-            continue;
-          }*/
-          const uint8_t mask0 = uint8_t(mask);
-
-          const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-          const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-          const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-          const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-          const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += row0[0];
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += row1[0];
-
-          buf += 8;
-      }
-    // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000)==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            } else {
+                // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+                const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+                const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+                forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+                const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+#endif
+                /* In this branch we handle three cases:
+                  1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+                  We expand the input word (16-bit) into two words (32-bit), thus
+                  we have room for four bytes. However, we need five distinct bit
+                  layouts. Note that the last byte in cases #2 and #3 is the same.
+
+                  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+                  in register t2.
+
+                  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+                  either byte 1 for case #2 or byte 2 for case #3. Note that they
+                  differ by exactly one bit.
+
+                  Finally from these two words we build proper UTF-8 sequence, taking
+                  into account the case (i.e, the number of bytes to write).
+                */
+                /**
+                 * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+                 * t2 => [0ccc|cccc] [10cc|cccc]
+                 * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+                 */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+                // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+                const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
+                // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+                const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+                // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+                const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+                // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+                const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+                // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+                const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+                // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+                const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+                // [00bb|bbbb|0000|aaaa]
+                const uint16x8_t s2 = vorrq_u16(s0, s1s);
+                // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+                const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+                const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+                const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
+                const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+                const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+                // 4. expand words 16-bit => 32-bit
+                const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+                const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+                // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000);
+                const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000);
+#else
+                const uint16x8_t onemask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000 };
+                const uint16x8_t twomask = { 0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000 };
+#endif
+                const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+                const uint16_t mask = vaddvq_u16(combined);
+                // The following fast path may or may not be beneficial.
+                /*if(mask == 0) {
+                  // We only have three-byte words. Use fast path.
+                  const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+                  const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+                  const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+                  vst1q_u8(utf8_output, utf8_0);
+                  utf8_output += 12;
+                  vst1q_u8(utf8_output, utf8_1);
+                  utf8_output += 12;
+                  buf += 8;
+                  continue;
+                }*/
+                const uint8_t mask0 = uint8_t(mask);
+                const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+                const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+                const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+                const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+                const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+                const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+                const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+                vst1q_u8(utf8_output, utf8_0);
+                utf8_output += row0[0];
+                vst1q_u8(utf8_output, utf8_1);
+                utf8_output += row1[0];
+
+                buf += 8;
+            }
+            // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
         } else {
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  if (vmaxvq_u16(forbidden_bytemask) != 0) {
-    return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
-  }
-  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
-}
-
-
-std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
-
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  while (buf + 16 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-      if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          continue; // we are done for this round!
-      }
-
-      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    // check for invalid input
+    if (vmaxvq_u16(forbidden_bytemask) != 0) {
+        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+    }
+    return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+}
+
+std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
+
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    while (buf + 16 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+        uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t*>(buf + 4));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+            // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+            // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+            uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+            if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                continue; // we are done for this round!
+            }
 
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const uint16x8_t t3 = vorrq_u16(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-            // 2. merge ASCII and 2-byte codewords
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-            // 3. prepare bitmask for 8-bit lookup
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080);
-  #else
-            const uint16x8_t mask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080 };
-  #endif
-            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const uint8x16_t shuffle = vld1q_u8(row + 1);
-            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+            if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+                // 1. prepare 2-byte values
+                // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+                // expected output   : [110a|aaaa|10bb|bbbb] x 8
+                const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+                const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+                // t0 = [000a|aaaa|bbbb|bb00]
+                const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+                // t1 = [000a|aaaa|0000|0000]
+                const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+                // t2 = [0000|0000|00bb|bbbb]
+                const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+                // t3 = [000a|aaaa|00bb|bbbb]
+                const uint16x8_t t3 = vorrq_u16(t1, t2);
+                // t4 = [110a|aaaa|10bb|bbbb]
+                const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+                // 2. merge ASCII and 2-byte codewords
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+                const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+                // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080);
+#else
+                const uint16x8_t mask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080 };
+#endif
+                uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+                // 4. pack the bytes
+                const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+                const uint8x16_t shuffle = vld1q_u8(row + 1);
+                const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
 
-            // 5. store bytes
-            vst1q_u8(utf8_output, utf8_packed);
+                // 5. store bytes
+                vst1q_u8(utf8_output, utf8_packed);
 
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
+                // 6. adjust pointers
+                buf += 8;
+                utf8_output += row[0];
+                continue;
 
-      } else {
-        // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-        // check for invalid input
-        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-        const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
-        if (vmaxvq_u16(forbidden_bytemask) != 0) {
-          return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
-        }
-
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-  #else
-          const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-  #endif
-          /* In this branch we handle three cases:
-            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-            We expand the input word (16-bit) into two words (32-bit), thus
-            we have room for four bytes. However, we need five distinct bit
-            layouts. Note that the last byte in cases #2 and #3 is the same.
-
-            We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-            in register t2.
-
-            We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-            either byte 1 for case #2 or byte 2 for case #3. Note that they
-            differ by exactly one bit.
-
-            Finally from these two words we build proper UTF-8 sequence, taking
-            into account the case (i.e, the number of bytes to write).
-          */
-          /**
-           * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-           * t2 => [0ccc|cccc] [10cc|cccc]
-           * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-           */
-  #define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-          // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-          const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
-          // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-          const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-          // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-          const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-          // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-          const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-          // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-          const uint16x8_t s1 = vandq_u16(utf16_packed, vec(0b0000111111000000));
-          // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-          const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-          // [00bb|bbbb|0000|aaaa]
-          const uint16x8_t s2 = vorrq_u16(s0, s1s);
-          // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-          const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-          const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-          const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
-          const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-          const uint16x8_t s4 = veorq_u16(s3, m0);
-  #undef vec
-
-          // 4. expand words 16-bit => 32-bit
-          const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-          const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-          // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 );
-          const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 );
-  #else
-          const uint16x8_t onemask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 };
-          const uint16x8_t twomask = { 0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 };
-  #endif
-          const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-          const uint16_t mask = vaddvq_u16(combined);
-          // The following fast path may or may not be beneficial.
-          /*if(mask == 0) {
-            // We only have three-byte words. Use fast path.
-            const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-            vst1q_u8(utf8_output, utf8_0);
-            utf8_output += 12;
-            vst1q_u8(utf8_output, utf8_1);
-            utf8_output += 12;
-            buf += 8;
-            continue;
-          }*/
-          const uint8_t mask0 = uint8_t(mask);
-
-          const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-          const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-          const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-          const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-          const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += row0[0];
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += row1[0];
-
-          buf += 8;
-      }
-    // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000)==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            } else {
+                // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+                // check for invalid input
+                const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+                const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+                const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
+                if (vmaxvq_u16(forbidden_bytemask) != 0) {
+                    return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
+                }
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+                const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+#endif
+                /* In this branch we handle three cases:
+                  1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+                  We expand the input word (16-bit) into two words (32-bit), thus
+                  we have room for four bytes. However, we need five distinct bit
+                  layouts. Note that the last byte in cases #2 and #3 is the same.
+
+                  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+                  in register t2.
+
+                  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+                  either byte 1 for case #2 or byte 2 for case #3. Note that they
+                  differ by exactly one bit.
+
+                  Finally from these two words we build proper UTF-8 sequence, taking
+                  into account the case (i.e., the number of bytes to write).
+                */
+                /**
+                 * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+                 * t2 => [0ccc|cccc] [10cc|cccc]
+                 * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+                 */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+                // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+                const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
+                // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+                const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+                // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+                const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+                // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+                const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+                // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+                const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+                // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+                const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+                // [00bb|bbbb|0000|aaaa]
+                const uint16x8_t s2 = vorrq_u16(s0, s1s);
+                // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+                const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+                const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+                const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
+                const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+                const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+                // 4. expand words 16-bit => 32-bit
+                const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+                const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+                // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000);
+                const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000);
+#else
+                const uint16x8_t onemask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000 };
+                const uint16x8_t twomask = { 0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000 };
+#endif
+                const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+                const uint16_t mask = vaddvq_u16(combined);
+                // The following fast path may or may not be beneficial.
+                /*if(mask == 0) {
+                  // We only have three-byte words. Use fast path.
+                  const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+                  const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+                  const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+                  vst1q_u8(utf8_output, utf8_0);
+                  utf8_output += 12;
+                  vst1q_u8(utf8_output, utf8_1);
+                  utf8_output += 12;
+                  buf += 8;
+                  continue;
+                }*/
+                const uint8_t mask0 = uint8_t(mask);
+
+                const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+                const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+                const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+                const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+                const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+                const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+                const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+                vst1q_u8(utf8_output, utf8_0);
+                utf8_output += row0[0];
+                vst1q_u8(utf8_output, utf8_1);
+                utf8_output += row1[0];
+
+                buf += 8;
+            }
+            // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
         } else {
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
 /* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) {
-  uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
-  const char32_t* end = buf + len;
-
-  uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
-
-  while(buf + 4 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(in) <= 0xFFFF) {
-      uint16x4_t utf16_packed = vmovn_u32(in);
-
-      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-      forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
-
-      if (!match_system(big_endian)) {
-        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
-        #else
-        const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
-        #endif
-        utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
-      }
-      vst1_u16(utf16_output, utf16_packed);
-      utf16_output += 4;
-      buf += 4;
-    } else {
-      size_t forward = 3;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
-          *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out)
+{
+    uint16_t* utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
+    const char32_t* end = buf + len;
+
+    uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
+
+    while (buf + 4 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(in) <= 0xFFFF) {
+            uint16x4_t utf16_packed = vmovn_u32(in);
+
+            const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
+            const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
+            forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
+
+            if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
+#else
+                const uint8x8_t swap = { 1, 0, 3, 2, 5, 4, 7, 6 };
+#endif
+                utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
+            }
+            vst1_u16(utf16_output, utf16_packed);
+            utf16_output += 4;
+            buf += 4;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (!match_system(big_endian)) {
-            high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
-        }
-      }
-      buf += k;
-    }
-  }
-
-  // check for invalid input
-  if (vmaxv_u16(forbidden_bytemask) != 0) {
-    return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
-  }
-
-  return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
-}
-
-
-template <endianness big_endian>
-std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) {
-  uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
-
-  while(buf + 4 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(in) <= 0xFFFF) {
-      uint16x4_t utf16_packed = vmovn_u32(in);
-
-      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-      const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
-      if (vmaxv_u16(forbidden_bytemask) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
-      }
-
-      if (!match_system(big_endian)) {
-        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
-        #else
-        const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
-        #endif
-        utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
-      }
-      vst1_u16(utf16_output, utf16_packed);
-      utf16_output += 4;
-      buf += 4;
-    } else {
-      size_t forward = 3;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
-          *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+            size_t forward = 3;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (!match_system(big_endian)) {
+                        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+                        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
+        }
+    }
+
+    // check for invalid input
+    if (vmaxv_u16(forbidden_bytemask) != 0) {
+        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+    }
+
+    return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
+}
+
+template<endianness big_endian>
+std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out)
+{
+    uint16_t* utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
+
+    while (buf + 4 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(in) <= 0xFFFF) {
+            uint16x4_t utf16_packed = vmovn_u32(in);
+
+            const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
+            const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
+            const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
+            if (vmaxv_u16(forbidden_bytemask) != 0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
+            }
+
+            if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
+#else
+                const uint8x8_t swap = { 1, 0, 3, 2, 5, 4, 7, 6 };
+#endif
+                utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
+            }
+            vst1_u16(utf16_output, utf16_packed);
+            utf16_output += 4;
+            buf += 4;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (!match_system(big_endian)) {
-            high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 3;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (!match_system(big_endian)) {
+                        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+                        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace arm64 {
@@ -13769,92 +16026,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index();
+    simdutf_really_inline bool has_full_block() const;
+    simdutf_really_inline const uint8_t* full_block() const;
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+     * function returns 0 without writing to dst). In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance();
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf;
+    const size_t len;
+    const size_t lenminusstep;
+    size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    in.store(reinterpret_cast<uint8_t*>(buf));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') {
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+    }
+    buf[64] = '\0';
+    return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
+    , idx { 0 }
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{
+    if (len == idx) {
+        return 0;
+    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx);
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE;
 }
 
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace arm64 {
@@ -13863,21 +16138,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -13885,101 +16161,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -13990,51 +16257,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete;
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -14043,7 +16313,7 @@ using utf8_validation::utf8_checker;
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace arm64 {
@@ -14054,15 +16324,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -14071,97 +16342,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 };
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            res.count += count;
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length)
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 };
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            return result(res.error, count + res.count);
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 } // namespace utf8_validation
@@ -14170,10 +16450,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
@@ -14181,63 +16460,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // The implementation is not specific to haswell, which is why it lives in the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation mask is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -14245,32 +16525,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -14278,258 +16557,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0)) // zero means no UTF-8 error has been recorded yet
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // prev1 comes from input.prev<1>(prev_input) (presumably input shifted back by one byte; confirm).
+        // The special-case flags for (prev1, input) are combined with the length check and OR-ed into error.
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the eighth leading byte counted from the end.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the eighth leading byte counted from the end.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -14539,68 +16841,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -14608,251 +16908,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -14862,36 +17184,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240);
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -14899,64 +17222,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -14964,739 +17295,1106 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf16.h */
+
+// placeholder scalars
+
 //
 // Implementation-specific overrides
 //
 namespace simdutf {
 namespace arm64 {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) {
+        return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
     } else {
-      return simdutf::encoding_type::unspecified;
+        if (implementation::validate_utf8(input, length)) {
+            return simdutf::encoding_type::UTF8;
+        } else {
+            return simdutf::encoding_type::unspecified;
+        }
     }
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_utf8(buf,len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    return arm64::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_ascii(buf,len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return arm64::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = arm_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
+    const char32_t* tail = arm_validate_utf32le(buf, len);
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  result res = arm_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
+    result res = arm_validate_utf32le_with_errors(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    char32_t* utf32_output) const noexcept
+{
+    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
     }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // Little-endian UTF-16 -> UTF-32: run the arm64 kernel first, then the scalar routine on whatever input it left unconsumed.
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even if finished
+    std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count; rebase its position onto buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used as the number of words it wrote
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // Big-endian UTF-16 -> UTF-32: run the arm64 kernel first, then the scalar routine on whatever input it left unconsumed.
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even if finished
+    std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count; rebase its position onto buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used as the number of words it wrote
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
     }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // UTF-32 -> little-endian UTF-16: run the arm64 kernel first, then the scalar routine on whatever input it left unconsumed.
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even if finished
+    std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the UTF-16 -> UTF-32 variants; presumably the scalar pass re-detects the error - confirm
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count; rebase its position onto buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used as the number of words it wrote
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // UTF-32 -> big-endian UTF-16: run the arm64 kernel first, then the scalar routine on whatever input it left unconsumed.
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even if finished
+    std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the UTF-16 -> UTF-32 variants; presumably the scalar pass re-detects the error - confirm
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count; rebase its position onto buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used as the number of words it wrote
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return scalar::utf32::latin1_length_from_utf32(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
-  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
-  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
-    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
-    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
-    const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
 
-    const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
-    const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
-    const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
 
-    const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
-    const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
 
-    size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
-    size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
-    size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
 
-    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
-  }
-  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
-    const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
-    const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
-    const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
-    size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
-    count += 4 + surrogate_count;
-  }
-  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf32_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
+    const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
+    const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+    const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input + pos));
+        const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
+        const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
+        const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
+        const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+
+        const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
+        const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
+        const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+
+        const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
+        const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+
+        size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
+        size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
+        size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+
+        count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+    }
+    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+    const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input + pos));
+        const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
+        const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
+        const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
+        size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
+        count += 4 + surrogate_count;
+    }
+    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf32_length_from_utf8(input, length);
 }
 
 } // namespace arm64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/end.h
 /* begin file src/simdutf/arm64/end.h */
 /* end file src/simdutf/arm64/end.h */
 /* end file src/arm64/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=fallback/implementation.cpp
 /* begin file src/fallback/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/begin.h
 /* begin file src/simdutf/fallback/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
 // #define SIMDUTF_IMPLEMENTATION fallback
 /* end file src/simdutf/fallback/begin.h */
 
-
-
-
-
-
-
-
 namespace simdutf {
 namespace fallback {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  int out = 0;
-  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
-  if((length % 2) == 0) {
-    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
-  }
-  if((length % 4) == 0) {
-    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
-  }
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    int out = 0;
+    if (validate_utf8(input, length)) {
+        out |= encoding_type::UTF8;
+    }
+    if ((length % 2) == 0) {
+        if (validate_utf16le(reinterpret_cast<const char16_t*>(input), length / 2)) {
+            out |= encoding_type::UTF16_LE;
+        }
+    }
+    if ((length % 4) == 0) {
+        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
+            out |= encoding_type::UTF32_LE;
+        }
+    }
 
-  return out;
+    return out;
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
     return scalar::utf8::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
     return scalar::utf8::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
     return scalar::ascii::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
     return scalar::ascii::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
     return scalar::utf32::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
     return scalar::utf32::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-   return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-   return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return scalar::utf8_to_utf32::convert_valid(input, size,  utf32_output);
+    char32_t* utf32_output) const noexcept
+{
+    return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  scalar::utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    scalar::utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return length;
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf8_length_from_utf32(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf16_length_from_utf32(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace fallback
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/end.h
 /* begin file src/simdutf/fallback/end.h */
 /* end file src/simdutf/fallback/end.h */
 /* end file src/fallback/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/implementation.cpp
 /* begin file src/icelake/implementation.cpp */
 
-
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/begin.h
 /* begin file src/simdutf/icelake/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
 // #define SIMDUTF_IMPLEMENTATION icelake
@@ -15708,7 +18406,7 @@ SIMDUTF_TARGET_ICELAKE
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) // flag must stay unspaced: it is passed on as the warning name
 #endif // end of workaround
 /* end file src/simdutf/icelake/begin.h */
 namespace simdutf {
@@ -15717,10 +18415,11 @@ namespace {
 #ifndef SIMDUTF_ICELAKE_H
 #error "icelake.h must be included"
 #endif
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
 /* begin file src/icelake/icelake_utf8_common.inl.cpp */
 // Common procedures for both validating and non-validating conversions from UTF-8.
-enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL};
+enum block_processing_mode { SIMDUTF_FULL,
+    SIMDUTF_TAIL };
 
 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
 using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
@@ -15736,302 +18435,329 @@ using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
     The provided in and out pointers are advanced according to how many input
     bytes have been processed, upon success.
 */
-template <block_processing_mode tail, endianness big_endian>
-simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
-  // constants
-  __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
-  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
-  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
-  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
-  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
-  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
-  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
-  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
-  __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  // Note that 'tail' is a compile-time constant !
-  __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
-  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
-  __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
-  if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII
-  // alternatively, we could do 'if (m1 == b) { '
+template<block_processing_mode tail, endianness big_endian>
+simdutf_really_inline bool process_block_utf8_to_utf16(const char*& in, char16_t*& out, size_t gap)
+{
+    // constants
+    __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
+    __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
+    __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
+    __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
+    __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
+    __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
+    __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
+    __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
+    __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    // Note that 'tail' is a compile-time constant !
+    __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
+    __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
+    __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
+    if (_ktestc_mask64_u8(m1, b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
+        // alternatively, we could do 'if (m1 == b) { '
+        if (tail == SIMDUTF_FULL) {
+            in += 64; // consumed 64 bytes
+            // we convert a full 64-byte block, writing 128 bytes.
+            __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+            if (big_endian) {
+                input1 = _mm512_shuffle_epi8(input1, byteflip);
+            }
+            _mm512_storeu_si512(out, input1);
+            out += 32;
+            __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+            if (big_endian) {
+                input2 = _mm512_shuffle_epi8(input2, byteflip);
+            }
+            _mm512_storeu_si512(out, input2);
+            out += 32;
+            return true; // we are done
+        } else {
+            in += gap;
+            if (gap <= 32) {
+                __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+                if (big_endian) {
+                    input1 = _mm512_shuffle_epi8(input1, byteflip);
+                }
+                _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
+                out += gap;
+            } else {
+                __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+                if (big_endian) {
+                    input1 = _mm512_shuffle_epi8(input1, byteflip);
+                }
+                _mm512_storeu_si512(out, input1);
+                out += 32;
+                __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+                if (big_endian) {
+                    input2 = _mm512_shuffle_epi8(input2, byteflip);
+                }
+                _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
+                out += gap - 32;
+            }
+            return true; // we are done
+        }
+    }
+    // classify characters further
+    __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
+        _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
+    __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
+        _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
+
+    __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
+        _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
+                        // Overlong 2-byte sequence
+    if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
+        // Overlong 2-byte sequence
+        return false;
+    }
+    if (_ktestz_mask64_u8(m34, m34) == 0) {
+        // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
+        __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
+            _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
+
+        __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
+
+        __mmask64 mp1 = _kshiftli_mask64(m234, 1);
+        __mmask64 mp2 = _kshiftli_mask64(m34, 2);
+        // We could do it as follows...
+        // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
+        // but GCC generates better code when we do:
+        if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
+            // Fast path with 1,2,3 bytes
+            __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
+            __mmask64 m1234 = _kor_mask64(m1, m234);
+            // mismatched continuation bytes:
+            if (tail == SIMDUTF_FULL) {
+                __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+                // the presence of a 1 bit indicates that they overlap.
+                // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 if all zeroes.
+                if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+                    return false;
+                }
+            } else {
+                __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+                if (mc != bxorm1234) {
+                    return false;
+                }
+            }
+            // mend: identifying the last bytes of each sequence to be decoded
+            __mmask64 mend = _kshiftri_mask64(m1234, 1);
+            if (tail != SIMDUTF_FULL) {
+                mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
+            }
+
+            __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+            __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+            __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+            __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
+            __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
+                clearedbytes); // the last byte of each character
+
+            __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+            __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+            __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+            __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
+                beforeasciibytes); // the second last bytes (of two, three byte seq,
+                                   // surrogates)
+            secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+
+            __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
+                indexofsecondlastbytes); // indices of the second last bytes
+            __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
+                clearedbytes); // only those that are the third last byte of a sequece
+            __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
+                thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                                // surrogate)
+            thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+            __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
+            // the elements of Wout excluding the last element if it happens to be a high surrogate:
+
+            __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
+
+            // Encodings out of range...
+            {
+                // the location of 3-byte sequence start bytes in the input
+                __mmask64 m3 = m34 & (b ^ m4);
+                // words in Wout corresponding to 3-byte sequences.
+                __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+                __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+                __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+                __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+                __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+                __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+                if (_kor_mask32(Msmall800, M3s)) {
+                    return false;
+                }
+            }
+            int64_t nout = _mm_popcnt_u64(mprocessed);
+            in += 64 - _lzcnt_u64(mprocessed);
+            if (big_endian) {
+                Wout = _mm512_shuffle_epi8(Wout, byteflip);
+            }
+            _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+            out += nout;
+            return true; // ok
+        }
+        //
+        // We have a 4-byte sequence, this is the general case.
+        // Slow!
+        __mmask64 mp3 = _kshiftli_mask64(m4, 3);
+        __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
+        __mmask64 m1234 = _kor_mask64(m1, m234);
+
+        // mend: identifying the last bytes of each sequence to be decoded
+        __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
+        if (tail != SIMDUTF_FULL) {
+            mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
+        }
+        __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+        __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+        __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+        __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
+        __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
+            clearedbytes); // the last byte of each character
+
+        __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+        __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+        __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+        __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
+            beforeasciibytes); // the second last bytes (of two, three byte seq,
+                               // surrogates)
+        secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+
+        __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
+            indexofsecondlastbytes); // indices of the second last bytes
+        __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
+            clearedbytes); // only those that are the third last byte of a sequece
+        __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
+            thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                            // surrogate)
+        thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+        __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
+        uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
+        __mmask32 Mlo = __mmask32(Mlo_uint64);
+        __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
+        __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
+            mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
+        __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
+            4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
+        __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
+            lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
+        __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
+            mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
+        // the elements of Wout excluding the last element if it happens to be a high surrogate:
+        __mmask32 Mout = ~(Mhi & 0x80000000);
+        __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
+
+        // mismatched continuation bytes:
+        if (tail == SIMDUTF_FULL) {
+            __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+            // the presence of a 1 bit indicates that they overlap.
+            // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 if all zeroes.
+            if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+                return false;
+            }
+        } else {
+            __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+            if (mc != bxorm1234) {
+                return false;
+            }
+        }
+        // Encodings out of range...
+        {
+            // the location of 3-byte sequence start bytes in the input
+            __mmask64 m3 = m34 & (b ^ m4);
+            // words in Wout corresponding to 3-byte sequences.
+            __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+            __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+            __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+            __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+            __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+            __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+            __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
+            __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
+            if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
+                return false;
+            }
+        }
+        in += 64 - _lzcnt_u64(mprocessed);
+        int64_t nout = _mm_popcnt_u64(mprocessed);
+        if (big_endian) {
+            Wout = _mm512_shuffle_epi8(Wout, byteflip);
+        }
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+        out += nout;
+        return true; // ok
+    }
+    // Fast path 2: all ASCII or 2 byte
+    __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
+    // on top of -0xc0 we subtract -2 which we get back later of the
+    // continuation byte tags
+    __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
+    __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
     if (tail == SIMDUTF_FULL) {
-      in += 64;          // consumed 64 bytes
-      // we convert a full 64-byte block, writing 128 bytes.
-      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-      if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
-      _mm512_storeu_si512(out, input1);
-      out += 32;
-      __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-      if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
-      _mm512_storeu_si512(out, input2);
-      out += 32;
-      return true; // we are done
+        __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
+        if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
+            return false;
+        }
     } else {
-      in += gap;
-      if (gap <= 32) {
-        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-        if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
-        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
-        out += gap;
-      } else {
-        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-        if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
-        _mm512_storeu_si512(out, input1);
-        out += 32;
-        __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-        if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
-        _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
-        out += gap - 32;
-      }
-      return true; // we are done
-    }
-  }
-  // classify characters further
-  __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
-                                        _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
-  __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
-                                       _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
-
-  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
-                                                     _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
-                                                                     // Overlong 2-byte sequence
-  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
-    // Overlong 2-byte sequence
-    return false;
-  }
-  if (_ktestz_mask64_u8(m34, m34) == 0) {
-    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
-    __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
-                                        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
-
-    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
-
-    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
-    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
-    // We could do it as follows...
-    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
-    // but GCC generates better code when we do:
-    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
-      // Fast path with 1,2,3 bytes
-      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
-      __mmask64 m1234 = _kor_mask64(m1, m234);
-      // mismatched continuation bytes:
-      if (tail == SIMDUTF_FULL) {
-        __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-        // the presence of a 1 bit indicates that they overlap.
-        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
-        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
-      } else {
-        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-        if (mc != bxorm1234) { return false; }
-      }
-      // mend: identifying the last bytes of each sequence to be decoded
-      __mmask64 mend = _kshiftri_mask64(m1234, 1);
-      if (tail != SIMDUTF_FULL) {
-        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
-      }
-
-
-      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-      __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-      __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-      __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
-      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
-                                                        clearedbytes); // the last byte of each character
-
-      __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
-      __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-      __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
-                                                              beforeasciibytes); // the second last bytes (of two, three byte seq,
-                                                                                 // surrogates)
-      secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
-
-      __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
-                                                       indexofsecondlastbytes); // indices of the second last bytes
-      __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
-                                                    clearedbytes); // only those that are the third last byte of a sequece
-      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
-                                                             thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                                                                             // surrogate)
-      thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
-      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
-      // the elements of Wout excluding the last element if it happens to be a high surrogate:
-
-      __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
-
-
-      // Encodings out of range...
-      {
-        // the location of 3-byte sequence start bytes in the input
-        __mmask64 m3 = m34 & (b ^ m4);
-        // words in Wout corresponding to 3-byte sequences.
-        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-        __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-        __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-        if (_kor_mask32(Msmall800, M3s)) { return false; }
-      }
-      int64_t nout = _mm_popcnt_u64(mprocessed);
-      in +=  64 - _lzcnt_u64(mprocessed);
-      if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
-      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-      out += nout;
-      return true; // ok
+        __mmask64 bxorleading = _kxor_mask64(b, leading);
+        if (_kshiftli_mask64(m234, 1) != bxorleading) {
+            return false;
+        }
     }
     //
-    // We have a 4-byte sequence, this is the general case.
-    // Slow!
-    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
-    __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
-    __mmask64 m1234 = _kor_mask64(m1, m234);
-
-    // mend: identifying the last bytes of each sequence to be decoded
-    __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
-    if (tail != SIMDUTF_FULL) {
-      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
-    }
-    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-    __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-    __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-    __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
-    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
-                                                      clearedbytes); // the last byte of each character
-
-    __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
-    __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-    __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
-                                                            beforeasciibytes); // the second last bytes (of two, three byte seq,
-                                                                               // surrogates)
-    secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
-
-    __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
-                                                     indexofsecondlastbytes); // indices of the second last bytes
-    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
-                                                  clearedbytes); // only those that are the third last byte of a sequece
-    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
-                                                           thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                                                                           // surrogate)
-    thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
-    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
-    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
-    __mmask32 Mlo = __mmask32(Mlo_uint64);
-    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
-    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
-                                                  mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
-    __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
-                                                                 4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
-    __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
-                                                   lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
-    __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
-                                         mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
-    // the elements of Wout excluding the last element if it happens to be a high surrogate:
-    __mmask32 Mout = ~(Mhi & 0x80000000);
-    __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
-
-
-    // mismatched continuation bytes:
     if (tail == SIMDUTF_FULL) {
-      __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-      // the presence of a 1 bit indicates that they overlap.
-      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
-      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
+        // In the two-byte/ASCII scenario, we are easily latency bound, so we want
+        // to increment the input buffer as quickly as possible.
+        // We process 32 bytes unless the byte at index 32 is a continuation byte,
+        // in which case we include it as well for a total of 33 bytes.
+        // Note that if x is an ASCII byte, then the following is false:
+        // int8_t(x) <= int8_t(0xc0) under two's complement.
+        in += 32;
+        if (int8_t(*in) <= int8_t(0xc0))
+            in++;
+        // The alternative is to do
+        // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+        // but it requires loading the input, doing the mask computation, and converting
+        // back the mask to a general register. It just takes too long, leaving the
+        // processor likely to be idle.
     } else {
-      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-      if (mc != bxorm1234) { return false; }
-    }
-    // Encodings out of range...
-    {
-      // the location of 3-byte sequence start bytes in the input
-      __mmask64 m3 = m34 & (b ^ m4);
-      // words in Wout corresponding to 3-byte sequences.
-      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-      __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-      __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
-      __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
-      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; }
-    }
-    in += 64 - _lzcnt_u64(mprocessed);
-    int64_t nout = _mm_popcnt_u64(mprocessed);
-    if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-    out += nout;
-    return true; // ok
-  }
-  // Fast path 2: all ASCII or 2 byte
-  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
-  // on top of -0xc0 we substract -2 which we get back later of the
-  // continuation byte tags
-  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
-  __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
-  if (tail == SIMDUTF_FULL) {
-    __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
-    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; }
-  } else {
-    __mmask64 bxorleading = _kxor_mask64(b, leading);
-    if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; }
-  }
-  //
-  if (tail == SIMDUTF_FULL) {
-    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
-    // to increment the input buffer as quickly as possible.
-    // We process 32 bytes unless the byte at index 32 is a continuation byte,
-    // in which case we include it as well for a total of 33 bytes.
-    // Note that if x is an ASCII byte, then the following is false:
-    // int8_t(x) <= int8_t(0xc0) under two's complement.
-    in += 32;
-    if(int8_t(*in) <= int8_t(0xc0)) in++;
-    // The alternative is to do
-    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-    // but it requires loading the input, doing the mask computation, and converting
-    // back the mask to a general register. It just takes too long, leaving the
-    // processor likely to be idle.
-  } else {
-    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-  }
-  __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte);          // will contain zero for ascii, and the data
-  lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead));                 // ... zero extended into words
-  __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
-  follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow));             // ... zero extended into words
-  lead = _mm512_slli_epi16(lead, 6);                                         // shifted into position
-  __m512i final = _mm512_add_epi16(follow, lead);                            // combining lead and follow
-
-  if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); }
-  if (tail == SIMDUTF_FULL) {
-    // Next part is UTF-16 specific and can be generalized to UTF-32.
-    int nout = _mm_popcnt_u32(uint32_t(leading));
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-  } else {
-    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-  }
-
-  return true; // we are fine.
-}
-
+        in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+    }
+    __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data
+    lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into words
+    __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
+    follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into words
+    lead = _mm512_slli_epi16(lead, 6); // shifted into position
+    __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
 
+    if (big_endian) {
+        final = _mm512_shuffle_epi8(final, byteflip);
+    }
+    if (tail == SIMDUTF_FULL) {
+        // Next part is UTF-16 specific and can be generalized to UTF-32.
+        int nout = _mm_popcnt_u32(uint32_t(leading));
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+        out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+    } else {
+        int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+        out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+    }
 
+    return true; // we are fine.
+}
 
 /*
     utf32_to_utf16_masked converts `count` lower UTF-32 words
@@ -16054,8 +18780,9 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t
     We pass it to the (always inlined) function to encourage the compiler to
     keep the value in a (constant) register.
 */
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output)
+{
 
     const __mmask16 valid = uint16_t((1 << count) - 1);
     // 1. check if we have any surrogate pairs
@@ -16063,11 +18790,11 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
     const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
 
     if (sp_mask == 0) {
-        if(big_endian) {
-          _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
+        if (big_endian) {
+            _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
 
         } else {
-          _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
+            _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
         }
         return count;
     }
@@ -16097,12 +18824,14 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
         // Here we want to trim all of the upper 16-bit words from the 2-byte
         // characters represented as 4-byte values. We can compute it from
         // sp_mask or the following... It can be more optimized!
-        const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-        const  __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1));
-        if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
+        const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+        const __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
+        if (big_endian) {
+            t5 = _mm512_shuffle_epi8(t5, byteflip);
+        }
         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
         __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
-        _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
+        _mm512_mask_storeu_epi16(output, (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
         //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
     }
 
@@ -16129,18 +18858,19 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
     We pass it to the (always inlined) function to encourage the compiler to
     keep the value in a (constant) register.
 */
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output)
+{
     // check if we have any surrogate pairs
     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
     const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
 
     if (sp_mask == 0) {
         // technically, it should be _mm256_storeu_epi16
-        if(big_endian) {
-          _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
+        if (big_endian) {
+            _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
         } else {
-          _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
+            _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
         }
         return count;
     }
@@ -16167,11 +18897,13 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3
         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
         __m512i t5 = _mm512_ror_epi32(t4, 16);
-        const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-        if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
+        const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+        if (big_endian) {
+            t5 = _mm512_shuffle_epi8(t5, byteflip);
+        }
         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
         __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
-        _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
+        _mm512_mask_storeu_epi16(output, (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
         //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
     }
 
@@ -16181,21 +18913,23 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3
 /**
  * Store the last N bytes of previous followed by 512-N bytes from input.
  */
-template <int N>
-__m512i prev(__m512i input, __m512i previous) {
-    static_assert(N<=32, "N must be no larger than 32");
-    const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11);
+template<int N>
+__m512i prev(__m512i input, __m512i previous)
+{
+    static_assert(N <= 32, "N must be no larger than 32");
+    const __m512i movemask = _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
     const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
 #if SIMDUTF_GCC8 || SIMDUTF_GCC9
-    constexpr int shift = 16-N; // workaround for GCC8,9
+    constexpr int shift = 16 - N; // workaround for GCC8,9
     return _mm512_alignr_epi8(input, rotated, shift);
 #else
-    return _mm512_alignr_epi8(input, rotated, 16-N);
+    return _mm512_alignr_epi8(input, rotated, 16 - N);
 #endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
 }
 
-template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
-__m512i shuffle_epi128(__m512i v) {
+template<unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
+__m512i shuffle_epi128(__m512i v)
+{
     static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
     static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
     static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
@@ -16205,16 +18939,18 @@ __m512i shuffle_epi128(__m512i v) {
     return _mm512_shuffle_i32x4(v, v, shuffle);
 }
 
-template <unsigned idx>
-constexpr __m512i broadcast_epi128(__m512i v) {
+template<unsigned idx>
+constexpr __m512i broadcast_epi128(__m512i v)
+{
     return shuffle_epi128<idx, idx, idx, idx>(v);
 }
 
 /**
  * Current unused.
  */
-template <int N>
-__m512i rotate_by_N_epi8(const __m512i input) {
+template<int N>
+__m512i rotate_by_N_epi8(const __m512i input)
+{
 
     // lanes order: 1, 2, 3, 0 => 0b00_11_10_01
     const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
@@ -16230,7 +18966,8 @@ __m512i rotate_by_N_epi8(const __m512i input) {
     0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
     corresponding bytes during pshufb.
 */
-simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
+simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8)
+{
     /*
         Input:
         - utf8: bytes stored at separate 32-bit words
@@ -16319,8 +19056,7 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
             0x0707070707070707,
             0x0b0a090900000000,
             0x0707070707070707,
-            0x0b0a090900000000
-        );
+            0x0b0a090900000000);
 
         const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
         values = _mm512_sllv_epi32(values, shift);
@@ -16341,8 +19077,7 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
             0x1919191919191919,
             0x0b10151500000000,
             0x1919191919191919,
-            0x0b10151500000000
-        );
+            0x0b10151500000000);
 
         const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
         values = _mm512_srlv_epi32(values, shift);
@@ -16351,29 +19086,29 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
     return values;
 }
 
-
-simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) {
+simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int& count)
+{
     const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
     const __m512i expand_ver2 = _mm512_setr_epi64(
-                0x0403020103020100,
-                0x0605040305040302,
-                0x0807060507060504,
-                0x0a09080709080706,
-                0x0c0b0a090b0a0908,
-                0x0e0d0c0b0d0c0b0a,
-                0x000f0e0d0f0e0d0c,
-                0x0201000f01000f0e
-    );
+        0x0403020103020100,
+        0x0605040305040302,
+        0x0807060507060504,
+        0x0a09080709080706,
+        0x0c0b0a090b0a0908,
+        0x0e0d0c0b0d0c0b0a,
+        0x000f0e0d0f0e0d0c,
+        0x0201000f01000f0e);
     const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
     const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
     const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
     const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
     const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
     count = static_cast<int>(count_ones(leading_bytes));
-    return  _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
+    return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
 }
 
-simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
+simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input)
+{
     __m512i char_class = _mm512_srli_epi32(input, 4);
     /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
     const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
@@ -16382,7 +19117,7 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
     return expanded_utf8_to_utf32(char_class, input);
 }
 /* end file src/icelake/icelake_utf8_common.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_macros.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_macros.inl.cpp
 /* begin file src/icelake/icelake_macros.inl.cpp */
 
 /*
@@ -16426,99 +19161,97 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
         ]
 */
 
-#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                    \
-        {                                                                                                    \
-            const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                              \
-            const __m512i expand_ver2 = _mm512_setr_epi64(                                                   \
-                0x0403020103020100,                                                                          \
-                0x0605040305040302,                                                                          \
-                0x0807060507060504,                                                                          \
-                0x0a09080709080706,                                                                          \
-                0x0c0b0a090b0a0908,                                                                          \
-                0x0e0d0c0b0d0c0b0a,                                                                          \
-                0x000f0e0d0f0e0d0c,                                                                          \
-                0x0201000f01000f0e                                                                           \
-            );                                                                                               \
-            const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                  \
-                                                                                                             \
-            __mmask16 leading_bytes;                                                                         \
-            const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                             \
-            const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                         \
-            const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                             \
-            leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                       \
-                                                                                                             \
-            __m512i char_class;                                                                              \
-            char_class = _mm512_srli_epi32(input, 4);                                                        \
-            /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                           \
-            const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                             \
-            const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                       \
-            char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);              \
-                                                                                                             \
-            const int valid_count = static_cast<int>(count_ones(leading_bytes));                             \
-            const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                 \
-                                                                                                             \
-            const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);    \
-                                                                                                             \
-            if (UTF32) {                                                                                     \
-                if(MASKED) {                                                                                 \
-                    const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                \
-                    _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                  \
-                } else {                                                                                     \
-                    _mm512_storeu_si512((__m512i*)output, out);                                              \
-                }                                                                                            \
-                output += valid_count;                                                                       \
-            } else {                                                                                         \
-                if(MASKED) {                                                                                 \
-                    output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
-                } else {                                                                                     \
-                    output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output));        \
-                }                                                                                            \
-            }                                                                                                \
-        }
-
-#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                    \
-{                                                                                                           \
-    if (UTF32) {                                                                                            \
-        if(MASKED) {                                                                                        \
-            const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                  \
-            _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                  \
-        } else {                                                                                            \
-            _mm512_storeu_si512((__m512i*)output, INPUT);                                              \
-        }                                                                                                   \
-        output += VALID_COUNT;                                                                              \
-    } else {                                                                                                \
-        if(MASKED) {                                                                                        \
-            output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));      \
-        } else {                                                                                            \
-            output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));             \
-        }                                                                                                   \
-    }                                                                                                       \
-}
-
-
-#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                  \
-        if (UTF32) {                                                                      \
-                const __m128i t0 = _mm512_castsi512_si128(utf8);                          \
-                const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                    \
-                const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                    \
-                const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                    \
-                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi32(t0)); \
-                _mm512_storeu_si512((__m512i*)(output + 1*16), _mm512_cvtepu8_epi32(t1)); \
-                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi32(t2)); \
-                _mm512_storeu_si512((__m512i*)(output + 3*16), _mm512_cvtepu8_epi32(t3)); \
-        } else {                                                                          \
-                const __m256i h0 = _mm512_castsi512_si256(utf8);                          \
-                const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                    \
-                if(big_endian) {                                                          \
-                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
-                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
-                } else {                                                                  \
-                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi16(h0)); \
-                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi16(h1)); \
-                }                                                                         \
-        }
+#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                                     \
+    {                                                                                                                         \
+        const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                                                   \
+        const __m512i expand_ver2 = _mm512_setr_epi64(                                                                        \
+            0x0403020103020100,                                                                                               \
+            0x0605040305040302,                                                                                               \
+            0x0807060507060504,                                                                                               \
+            0x0a09080709080706,                                                                                               \
+            0x0c0b0a090b0a0908,                                                                                               \
+            0x0e0d0c0b0d0c0b0a,                                                                                               \
+            0x000f0e0d0f0e0d0c,                                                                                               \
+            0x0201000f01000f0e);                                                                                              \
+        const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                                       \
+                                                                                                                              \
+        __mmask16 leading_bytes;                                                                                              \
+        const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                                                  \
+        const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                                              \
+        const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                                                  \
+        leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                                            \
+                                                                                                                              \
+        __m512i char_class;                                                                                                   \
+        char_class = _mm512_srli_epi32(input, 4);                                                                             \
+        /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                                                \
+        const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                                                  \
+        const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                                            \
+        char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);                                   \
+                                                                                                                              \
+        const int valid_count = static_cast<int>(count_ones(leading_bytes));                                                  \
+        const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                                      \
+                                                                                                                              \
+        const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);                         \
+                                                                                                                              \
+        if (UTF32) {                                                                                                          \
+            if (MASKED) {                                                                                                     \
+                const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                                     \
+                _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                                       \
+            } else {                                                                                                          \
+                _mm512_storeu_si512((__m512i*)output, out);                                                                   \
+            }                                                                                                                 \
+            output += valid_count;                                                                                            \
+        } else {                                                                                                              \
+            if (MASKED) {                                                                                                     \
+                output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t*>(output)); \
+            } else {                                                                                                          \
+                output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t*>(output));        \
+            }                                                                                                                 \
+        }                                                                                                                     \
+    }
+
+#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                                        \
+    {                                                                                                                           \
+        if (UTF32) {                                                                                                            \
+            if (MASKED) {                                                                                                       \
+                const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                                  \
+                _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                                  \
+            } else {                                                                                                            \
+                _mm512_storeu_si512((__m512i*)output, INPUT);                                                                   \
+            }                                                                                                                   \
+            output += VALID_COUNT;                                                                                              \
+        } else {                                                                                                                \
+            if (MASKED) {                                                                                                       \
+                output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t*>(output)); \
+            } else {                                                                                                            \
+                output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t*>(output));        \
+            }                                                                                                                   \
+        }                                                                                                                       \
+    }
+
+#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                                               \
+    if (UTF32) {                                                                                                       \
+        const __m128i t0 = _mm512_castsi512_si128(utf8);                                                               \
+        const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                                                         \
+        const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                                                         \
+        const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                                                         \
+        _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_cvtepu8_epi32(t0));                                    \
+        _mm512_storeu_si512((__m512i*)(output + 1 * 16), _mm512_cvtepu8_epi32(t1));                                    \
+        _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_cvtepu8_epi32(t2));                                    \
+        _mm512_storeu_si512((__m512i*)(output + 3 * 16), _mm512_cvtepu8_epi32(t3));                                    \
+    } else {                                                                                                           \
+        const __m256i h0 = _mm512_castsi512_si256(utf8);                                                               \
+        const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                                                         \
+        if (big_endian) {                                                                                              \
+            _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
+            _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
+        } else {                                                                                                       \
+            _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_cvtepu8_epi16(h0));                                \
+            _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_cvtepu8_epi16(h1));                                \
+        }                                                                                                              \
+    }
 /* end file src/icelake/icelake_macros.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
 /* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
 // file included directly
 
@@ -16539,23 +19272,23 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
     - pair.first    - the first unprocessed input byte
     - pair.second   - the first unprocessed output word
 */
-template <endianness big_endian, typename OUTPUT>
-std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
+template<endianness big_endian, typename OUTPUT>
+std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords)
+{
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
 
     __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
     const char* ptr = str;
     const char* end = ptr + len;
 
@@ -16570,7 +19303,7 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-        if(ascii == 0) {
+        if (ascii == 0) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16584,8 +19317,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if(valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+        if (valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -16603,8 +19336,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if(valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
+        if (valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -16614,14 +19347,14 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4*16;
+        ptr += 4 * 16;
     }
 
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-        if(ascii == 0) {
+        if (ascii == 0) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16633,8 +19366,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if(valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+            if (valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -16648,22 +19381,21 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3*16;
+            ptr += 3 * 16;
         }
     }
-    return {ptr, output};
+    return { ptr, output };
 }
 
-
 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
 /* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
 /* begin file src/icelake/icelake_utf8_validation.inl.cpp */
 // file included directly
 
-
-simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
-  __m512i mask1 = _mm512_setr_epi64(
+simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1)
+{
+    __m512i mask1 = _mm512_setr_epi64(
         0x0202020202020202,
         0x4915012180808080,
         0x0202020202020202,
@@ -16685,7 +19417,7 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0xcbcbdbcbcbcbcbcb,
         0xcbcbcb8b8383a3e7,
         0xcbcbdbcbcbcbcbcb);
-     __m512i index2 = _mm512_and_si512(prev1, v_0f);
+    __m512i index2 = _mm512_and_si512(prev1, v_0f);
 
     __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
     __m512i mask3 = _mm512_setr_epi64(
@@ -16696,19 +19428,19 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0x101010101010101,
         0x1010101babaaee6,
         0x101010101010101,
-        0x1010101babaaee6
-    );
+        0x1010101babaaee6);
     __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
     __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
     return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
-  }
+}
 
-  simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
-      const __m512i prev_input, const __m512i sc) {
+simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
+    const __m512i prev_input, const __m512i sc)
+{
     __m512i prev2 = prev<2>(input, prev_input);
     __m512i prev3 = prev<3>(input, prev_input);
-    __m512i is_third_byte  = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0
-    __m512i is_fourth_byte  = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0
+    __m512i is_third_byte = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
+    __m512i is_fourth_byte = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
     __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte);
     const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
     is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
@@ -16716,13 +19448,14 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
     const __m512i v_80 = _mm512_set1_epi8(char(0x80));
     return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010);
     //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80);
-    //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
-  }
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline __m512i is_incomplete(const __m512i input) {
+    // return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
+}
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline __m512i is_incomplete(const __m512i input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     __m512i max_value = _mm512_setr_epi64(
@@ -16735,59 +19468,63 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0xffffffffffffffff,
         0xbfdfefffffffffff);
     return _mm512_subs_epu8(input, max_value);
-  }
+}
 
-  struct avx512_utf8_checker {
+struct avx512_utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
-    __m512i error{};
+    __m512i error {};
 
     // The last input we received
-    __m512i prev_input_block{};
+    __m512i prev_input_block {};
     // Whether the last input we received was incomplete (used for ASCII fast path)
-    __m512i prev_incomplete{};
+    __m512i prev_incomplete {};
 
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      __m512i prev1 = prev<1>(input, prev_input);
-      __m512i sc = check_special_cases(input, prev1);
-      this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
+    simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        __m512i prev1 = prev<1>(input, prev_input);
+        __m512i sc = check_special_cases(input, prev1);
+        this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error = _mm512_or_si512(this->error, this->prev_incomplete);
     }
 
     // returns true if ASCII.
-    simdutf_really_inline bool check_next_input(const __m512i input) {
-      const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-      const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
-      if(ascii == 0) {
-        this->error = _mm512_or_si512(this->error, this->prev_incomplete);
-        return true;
-      } else {
-        this->check_utf8_bytes(input, this->prev_input_block);
-        this->prev_incomplete = is_incomplete(input);
-        this->prev_input_block = input;
-        return false;
-      }
+    simdutf_really_inline bool check_next_input(const __m512i input)
+    {
+        const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+        const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
+        if (ascii == 0) {
+            this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+            return true;
+        } else {
+            this->check_utf8_bytes(input, this->prev_input_block);
+            this->prev_incomplete = is_incomplete(input);
+            this->prev_input_block = input;
+            return false;
+        }
     }
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
+    simdutf_really_inline bool errors() const
+    {
         return _mm512_test_epi8_mask(this->error, this->error) != 0;
     }
 
-  }; // struct avx512_utf8_checker
+}; // struct avx512_utf8_checker
 /* end file src/icelake/icelake_utf8_validation.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
 /* begin file src/icelake/icelake_from_utf8.inl.cpp */
 // file included directly
 
@@ -16800,48 +19537,56 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
  * completed. Upon error, the output is set to null.
  */
 
-template <endianness big_endian>
-utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
-  const char *const final_in = in + len;
-  bool result = true;
-  while (result) {
-    if (in + 64 <= final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
-    } else if(in < final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
-    } else { break; }
-  }
-  if(!result) { out = nullptr; }
-  return std::make_pair(in, out);
-}
-
-template <endianness big_endian>
-simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) {
-  const char *const init_in = in;
-  const char16_t *const init_out = out;
-  const char *const final_in = in + len;
-  bool  result = true;
-  while (result) {
-    if (in + 64 <= final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
-    } else if(in < final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
-    } else { break; }
-  }
-  if(!result) {
-    // rewind_and_convert_with_errors will seek a potential error from in onward,
-    // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
-    simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
-    res.count += (in - init_in);
-    return res;
-  } else {
-    return simdutf::result(error_code::SUCCESS,out - init_out);
-  }
+template<endianness big_endian>
+utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char* in, size_t len, char16_t* out)
+{
+    const char* const final_in = in + len;
+    bool result = true;
+    while (result) {
+        if (in + 64 <= final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
+        } else if (in < final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
+        } else {
+            break;
+        }
+    }
+    if (!result) {
+        out = nullptr;
+    }
+    return std::make_pair(in, out);
 }
 
+template<endianness big_endian>
+simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char* in, size_t len, char16_t* out)
+{
+    const char* const init_in = in;
+    const char16_t* const init_out = out;
+    const char* const final_in = in + len;
+    bool result = true;
+    while (result) {
+        if (in + 64 <= final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
+        } else if (in < final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
+        } else {
+            break;
+        }
+    }
+    if (!result) {
+        // rewind_and_convert_with_errors will seek a potential error from in onward,
+        // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
+        simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
+        res.count += (in - init_in);
+        return res;
+    } else {
+        return simdutf::result(error_code::SUCCESS, out - init_out);
+    }
+}
 
-template <endianness big_endian, typename OUTPUT>
-std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
+template<endianness big_endian, typename OUTPUT>
+std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords)
+{
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
@@ -16850,17 +19595,16 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     const char* ptr = str;
     const char* end = ptr + len;
     __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
     OUTPUT* output = dwords;
-    avx512_utf8_checker checker{};
+    avx512_utf8_checker checker {};
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
@@ -16869,7 +19613,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
      */
     while (ptr + 64 + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16882,8 +19626,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if(valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+        if (valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -16901,8 +19645,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if(valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
+        if (valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -16912,7 +19656,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4*16;
+        ptr += 4 * 16;
     }
     const char* validatedptr = ptr; // validated up to ptr
 
@@ -16920,7 +19664,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     // 3*16 bytes, so we may end up double-validating 16 bytes.
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16932,8 +19676,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if(valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+            if (valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -16947,24 +19691,25 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3*16;
+            ptr += 3 * 16;
         }
-        validatedptr += 4*16;
+        validatedptr += 4 * 16;
     }
     {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
-       checker.check_next_input(utf8);
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - validatedptr)) - 1, (const __m512i*)validatedptr);
+        checker.check_next_input(utf8);
     }
     checker.check_eof();
-    if(checker.errors()) {
-        return {ptr, nullptr}; // We found an error.
+    if (checker.errors()) {
+        return { ptr, nullptr }; // We found an error.
     }
-    return {ptr, output};
+    return { ptr, output };
 }
 
 // Like validating_utf8_to_fixed_length but returns as soon as an error is identified
-template <endianness big_endian, typename OUTPUT>
-std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
+template<endianness big_endian, typename OUTPUT>
+std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords)
+{
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
@@ -16973,17 +19718,16 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     const char* ptr = str;
     const char* end = ptr + len;
     __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
     OUTPUT* output = dwords;
-    avx512_utf8_checker checker{};
+    avx512_utf8_checker checker {};
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
@@ -16992,14 +19736,14 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
      */
     while (ptr + 64 + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
             continue;
         }
-        if(checker.errors()) {
-            return {ptr, output, false}; // We found an error.
+        if (checker.errors()) {
+            return { ptr, output, false }; // We found an error.
         }
         const __m512i lane0 = broadcast_epi128<0>(utf8);
         const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -17008,8 +19752,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if(valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+        if (valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -17027,8 +19771,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if(valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
+        if (valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -17038,7 +19782,7 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4*16;
+        ptr += 4 * 16;
     }
     const char* validatedptr = ptr; // validated up to ptr
 
@@ -17046,12 +19790,12 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     // 3*16 bytes, so we may end up double-validating 16 bytes.
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
-        } else if(checker.errors()) {
-            return {ptr, output, false}; // We found an error.
+        } else if (checker.errors()) {
+            return { ptr, output, false }; // We found an error.
         } else {
             const __m512i lane0 = broadcast_epi128<0>(utf8);
             const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -17060,8 +19804,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if(valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+            if (valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -17075,22 +19819,22 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3*16;
+            ptr += 3 * 16;
         }
-        validatedptr += 4*16;
+        validatedptr += 4 * 16;
     }
     {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
-       checker.check_next_input(utf8);
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - validatedptr)) - 1, (const __m512i*)validatedptr);
+        checker.check_next_input(utf8);
     }
     checker.check_eof();
-    if(checker.errors()) {
-        return {ptr, output, false}; // We found an error.
+    if (checker.errors()) {
+        return { ptr, output, false }; // We found an error.
     }
-    return {ptr, output, true};
+    return { ptr, output, true };
 }
 /* end file src/icelake/icelake_from_utf8.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
 /* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
 // file included directly
 
@@ -17098,755 +19842,786 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
   Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* end = buf + len;
-  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
-  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
-  __mmask32 carry{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (buf + 32 <= end) {
-    // Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
-    __m512i in = _mm512_loadu_si512((__m512i*)buf);
-    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-
-    // H - bitmask for high surrogates
-    const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
-    // H - bitmask for low surrogates
-    const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
-
-    if ((H|L)) {
-      // surrogate pair(s) in a register
-      const __mmask32 V = (L ^ (carry | (H << 1)));   // A high surrogate must be followed by low one and a low one must be preceded by a high one.
-                                                      // If valid, V should be equal to 0
-
-      if(V == 0) {
-        // valid case
-        /*
-            Input surrogate pair:
-            |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
-                low surrogate      high surrogate
-        */
-        /*  1. Expand all words to 32-bit words
-            in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-        */
-        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-        const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1));
-
-        /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
-            in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-            shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-        */
-        const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
-        const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
-
-        /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
-            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-        */
-        const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
-        const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10);
-
-        /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
-            in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-            shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-            constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
-        */
-        const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
-        const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
-        const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
-
-        const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second);
-        const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant);
-
-        //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
-        const __mmask32 valid = ~L & 0x7fffffff;
-        // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
-        // to ease performance portability to Zen 4.
-        const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
-        const size_t howmany1 = count_ones((uint16_t)(valid));
-        _mm512_storeu_si512((__m512i *) utf32_output,  compressed_first);
-        utf32_output += howmany1;
-        const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
-        const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
-        // The following could be unsafe in some cases?
-        //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
-        _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<<howmany2)-1), compressed_second);
-        utf32_output += howmany2;
-        // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
-        buf += 31;
-        carry = (H >> 30) & 0x1;
-      } else {
-        // invalid case
-        return std::make_tuple(buf+carry, utf32_output, false);
-      }
-    } else {
-      // no surrogates
-      // extend all thirty-two 16-bit words to thirty-two 32-bit words
-      _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
-      _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)));
-      utf32_output += 32;
-      buf += 32;
-      carry = 0;
-    }
-  } // while
-  return std::make_tuple(buf+carry, utf32_output, true);
+template<endianness big_endian> // big_endian: when set, every 16-bit input word is byte-swapped before it is examined
+std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* end = buf + len;
+    const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+    const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
+    __mmask32 carry { 0 }; // 1 when the last word consumed by the previous iteration (word index 30) was a high surrogate
+    const __m512i byteflip = _mm512_setr_epi64( // shuffle control that swaps the two bytes of every 16-bit word
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (buf + 32 <= end) {
+        // Always safe: buf + 32 <= end, so at least 32 char16_t words (64 bytes) are readable:
+        __m512i in = _mm512_loadu_si512((__m512i*)buf);
+        if (big_endian) { // normalize byte order before the surrogate tests below
+            in = _mm512_shuffle_epi8(in, byteflip);
+        }
+
+        // H - bitmask for high surrogates
+        const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
+        // L - bitmask for low surrogates
+        const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
+
+        if ((H | L)) {
+            // surrogate pair(s) in a register
+            const __mmask32 V = (L ^ (carry | (H << 1))); // A high surrogate must be followed by low one and a low one must be preceded by a high one.
+                                                          // If valid, V should be equal to 0
+
+            if (V == 0) {
+                // valid case
+                /*
+                    Input surrogate pair:
+                    |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
+                        low surrogate      high surrogate
+                */
+                /*  1. Expand all words to 32-bit words
+                    in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+                */
+                const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+                const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+                /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
+                    in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+                    shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+                */
+                const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
+                const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
+
+                /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
+                    |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+                */
+                const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
+                const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);
+
+                /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
+                    in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+                    shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+                    constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
+                */
+                const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
+                const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
+                const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
+
+                const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16), aligned_second, shifted_second);
+                const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H >> 16), added_second, constant);
+
+                //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
+                const __mmask32 valid = ~L & 0x7fffffff;
+                // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
+                // to ease performance portability to Zen 4.
+                const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
+                const size_t howmany1 = count_ones((uint16_t)(valid));
+                _mm512_storeu_si512((__m512i*)utf32_output, compressed_first);
+                utf32_output += howmany1;
+                const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
+                const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
+                // The following could be unsafe in some cases?
+                //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
+                _mm512_mask_storeu_epi32((__m512i*)utf32_output, __mmask16((1 << howmany2) - 1), compressed_second);
+                utf32_output += howmany2;
+                // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
+                buf += 31;
+                carry = (H >> 30) & 0x1;
+            } else {
+                // invalid case: report failure at the first word not yet consumed (see the final return for buf + carry)
+                return std::make_tuple(buf + carry, utf32_output, false);
+            }
+        } else {
+            // no surrogates
+            // extend all thirty-two 16-bit words to thirty-two 32-bit words
+            _mm512_storeu_si512((__m512i*)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
+            _mm512_storeu_si512((__m512i*)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
+            utf32_output += 32;
+            buf += 32;
+            carry = 0;
+        }
+    } // while
+    return std::make_tuple(buf + carry, utf32_output, true); // carry == 1: buf[0] is the low half of a pair already written, so skip it
 }
 /* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
 /* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
 // file included directly
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  __m256i running_max = _mm256_setzero_si256();
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-  if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
-
-  return std::make_pair(buf, utf8_output);
+std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    __m256i running_max = _mm256_setzero_si256();
+    __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    // check for invalid input
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    return std::make_pair(buf, utf8_output);
 }
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const char32_t* start = buf;
-
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    // Check for too large input
-    const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-    if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-      // Check for illegal surrogate words
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-      }
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const char32_t* start = buf;
+
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        // Check for too large input
+        const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+        if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+        }
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+            // Check for illegal surrogate words
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+            }
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
 /* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
 // file included directly
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* end = buf + len;
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) // returns {input position reached, output position}; first is nullptr on invalid input
+{
+    const char32_t* end = buf + len;
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    __m256i forbidden_bytemask = _mm256_setzero_si256(); // accumulates lanes that held a surrogate on the vector path; tested once after the loop
 
+    while (buf + 8 + safety_margin <= end) { // stops early: the last (up to 8 + safety_margin - 1) elements are left unconverted
+        __m256i in = _mm256_loadu_si256((__m256i*)buf); // 8 x 32-bit input values
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) { // all 8 values fit in 16 bits: vector path
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); // (x & 0xf800) == 0xd800 <=> x in [0xD800, 0xDFFF] for 16-bit x
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); // narrow 8 x 32-bit to 8 x 16-bit, low half first
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // swaps the two bytes of every 16-bit unit
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // scalar fallback: convert up to 7 values one at a time
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // not taken while the loop condition holds (end - buf >= 8 + safety_margin)
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf16_output); // surrogate code point in the input: invalid
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf16_output); // value above 0x10FFFF: invalid
+                    }
+                    word -= 0x10000; // remaining value is at most 20 bits; split 10/10 below
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
+    // check for invalid input: a surrogate code point was seen on the vector path
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf16_output);
+    }
 
-  return std::make_pair(buf, utf16_output);
+    return std::make_pair(buf, utf16_output);
 }
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template <endianness big_endian>
-std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
+template<endianness big_endian>
+std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) // returns {error code + input offset from the start of buf, output position}
+{
+    const char32_t* start = buf; // kept to turn positions into offsets for the result
+    const char32_t* end = buf + len;
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    while (buf + 8 + safety_margin <= end) { // stops early: the last (up to 8 + safety_margin - 1) elements are left unconverted
+        __m256i in = _mm256_loadu_si256((__m256i*)buf); // 8 x 32-bit input values
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
-
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-      }
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+        if (saturation_bitmask == 0xffffffff) { // all 8 values fit in 16 bits: vector path
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); // (x & 0xf800) == 0xd800 <=> x in [0xD800, 0xDFFF] for 16-bit x
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); // offset is the start of this 8-element block, not the exact offending element
+            }
+
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); // narrow 8 x 32-bit to 8 x 16-bit, low half first
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // swaps the two bytes of every 16-bit unit
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // scalar fallback: convert up to 7 values one at a time
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // not taken while the loop condition holds (end - buf >= 8 + safety_margin)
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); // exact offset of the offending element
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); // exact offset of the offending element
+                    }
+                    word -= 0x10000; // remaining value is at most 20 bits; split 10/10 below
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); // count is the number of input elements consumed, which may be less than len
 }
 /* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
 /* begin file src/icelake/icelake_ascii_validation.inl.cpp */
 // file included directly
 
-bool validate_ascii(const char* buf, size_t len) {
-  const char* end = buf + len;
-  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  __m512i running_or = _mm512_setzero_si512();
-  for (; buf + 64 <= end; buf += 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
-    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
-  }
-  if(buf < end) {
-     const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end-buf)) - 1,(const __m512i*)buf);
-    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
-  }
-  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
+bool validate_ascii(const char* buf, size_t len) // true iff none of the len bytes at buf has its high bit (0x80) set; true for len == 0
+{
+    const char* end = buf + len;
+    const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); // 0x80 in every byte lane: the bit that is set only in non-ASCII bytes
+    __m512i running_or = _mm512_setzero_si512(); // accumulates the high bits of every byte seen so far
+    for (; buf + 64 <= end; buf += 64) { // full 64-byte blocks
+        const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
+        running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
+    }
+    if (buf < end) { // 1 to 63 trailing bytes
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end - buf)) - 1, (const __m512i*)buf); // mask selects the low (end - buf) byte lanes; the other lanes are zeroed
+        running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
+    }
+    return (_mm512_test_epi8_mask(running_or, running_or) == 0); // no byte lane of running_or is nonzero
 }
 /* end file src/icelake/icelake_ascii_validation.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
 /* begin file src/icelake/icelake_utf32_validation.inl.cpp */
 // file included directly
 
-const char32_t* validate_utf32(const char32_t* buf, size_t len) {
+const char32_t* validate_utf32(const char32_t* buf, size_t len)
+{
     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
 
     const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
@@ -17854,27 +20629,27 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len) {
     __m512i currentoffsetmax = _mm512_setzero_si512();
 
     while (buf <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-      buf += 16;
-      currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
-      currentmax = _mm512_max_epu32(utf32, currentmax);
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
+        buf += 16;
+        currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
+        currentmax = _mm512_max_epu32(utf32, currentmax);
     }
 
     const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
     const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
     __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-      return nullptr;
+        return nullptr;
     }
     is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-      return nullptr;
+        return nullptr;
     }
 
     return buf;
 }
 /* end file src/icelake/icelake_utf32_validation.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp
 /* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
 // file included directly
 
@@ -17884,194 +20659,188 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len) {
  * is written to 'outlen' and the function reports the number of input word
  * consumed.
  */
-template <endianness big_endian>
-size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
-                               unsigned char *outbuf, size_t *outlen) {
-  __m512i in;
-  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
-  __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  const char16_t * const inbuf_orig = inbuf;
-  const unsigned char * const outbuf_orig = outbuf;
-  size_t adjust = 0;
-  int carry = 0;
-
-  while (inlen >= 32) {
-    in = _mm512_loadu_si512(inbuf);
-    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-    inlen -= 31;
-  lastiteration:
-    inbuf += 31;
-
-  failiteration:
-    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
-      inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
-
-    if (_ktestz_mask32_u8(inmask, is234byte)) {
-      // fast path for ASCII only
-      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
-      outbuf += 31;
-      carry = 0;
-
-      if (inlen < 32) {
-        goto tail;
-      } else {
-        continue;
-      }
-    }
-
-    const __mmask32 is12byte =
-        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
-
-    if (_ktestc_mask32_u8(is12byte, inmask)) {
-      // fast path for 1 and 2 byte only
-
-      const __m512i twobytes = _mm512_ternarylogic_epi32(
-          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
-          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
-      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
-                                 _mm512_set1_epi16(int16_t(0x80c0)));
-      const __m512i cmpmask =
-          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
-                                  _mm512_set1_epi16(0x0800));
-      const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
-      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
-      _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
-                              out);
-      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
-      carry = 0;
-
-      if (inlen < 32) {
-        goto tail;
-      } else {
-        continue;
-      }
-    }
-    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
-
-
-    __m512i taglo = _mm512_set1_epi32(0x8080e000);
-    __m512i taghi = taglo;
-
-    const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
-    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
-        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
-    const __mmask32 losurr = _mm512_cmp_epu16_mask(
-        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
-
-    int carryout = 0;
-    if (!_kortestz_mask32_u8(hisurr, losurr)) {
-      // handle surrogates
-
-      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
-      __m512i his = _mm512_alignr_epi32(lo, hi, 1);
-
-      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
-      taglo =
-          _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
-      taghi =
-          _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
-
-      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
-      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
-      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
-      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
-      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
-      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
-
-      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
-
-      const uint32_t  h = _cvtmask32_u32(hisurr);
-      const uint32_t  l = _cvtmask32_u32(losurr);
-      // check for mismatched surrogates
-      if ((h + h + carry) ^ l) {
-        const uint32_t lonohi = l & ~(h + h + carry);
-        const uint32_t hinolo = h & ~(l >> 1);
-        inlen = _tzcnt_u32(hinolo | lonohi);
-        inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
-        in = _mm512_maskz_mov_epi16(inmask, in);
-        adjust = (int)inlen - 31;
-        inlen = 0;
-        goto failiteration;
-      }
-    }
+template<endianness big_endian>
+size_t utf16_to_utf8_avx512i(const char16_t* inbuf, size_t inlen,
+    unsigned char* outbuf, size_t* outlen)
+{
+    __m512i in;
+    __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
+    __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    const char16_t* const inbuf_orig = inbuf;
+    const unsigned char* const outbuf_orig = outbuf;
+    size_t adjust = 0;
+    int carry = 0;
+
+    while (inlen >= 32) {
+        in = _mm512_loadu_si512(inbuf);
+        if (big_endian) {
+            in = _mm512_shuffle_epi8(in, byteflip);
+        }
+        inlen -= 31;
+    lastiteration:
+        inbuf += 31;
+
+    failiteration:
+        const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
+            inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
+
+        if (_ktestz_mask32_u8(inmask, is234byte)) {
+            // fast path for ASCII only
+            _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
+            outbuf += 31;
+            carry = 0;
+
+            if (inlen < 32) {
+                goto tail;
+            } else {
+                continue;
+            }
+        }
 
-    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi);
-    carry = carryout;
+        const __mmask32 is12byte = _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
+
+        if (_ktestc_mask32_u8(is12byte, inmask)) {
+            // fast path for 1 and 2 byte only
+
+            const __m512i twobytes = _mm512_ternarylogic_epi32(
+                _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
+                _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
+            in = _mm512_mask_add_epi16(in, is234byte, twobytes,
+                _mm512_set1_epi16(int16_t(0x80c0)));
+            const __m512i cmpmask = _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
+                _mm512_set1_epi16(0x0800));
+            const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
+            const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
+            _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
+                out);
+            outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
+            carry = 0;
+
+            if (inlen < 32) {
+                goto tail;
+            } else {
+                continue;
+            }
+        }
+        __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+        __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+        __m512i taglo = _mm512_set1_epi32(0x8080e000);
+        __m512i taghi = taglo;
+
+        const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
+        const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
+            inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
+        const __mmask32 losurr = _mm512_cmp_epu16_mask(
+            fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
+
+        int carryout = 0;
+        if (!_kortestz_mask32_u8(hisurr, losurr)) {
+            // handle surrogates
+
+            __m512i los = _mm512_alignr_epi32(hi, lo, 1);
+            __m512i his = _mm512_alignr_epi32(lo, hi, 1);
+
+            const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
+            taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
+            taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
+
+            lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
+            hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
+            los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
+            his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
+            lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
+            hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
+
+            carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
+
+            const uint32_t h = _cvtmask32_u32(hisurr);
+            const uint32_t l = _cvtmask32_u32(losurr);
+            // check for mismatched surrogates
+            if ((h + h + carry) ^ l) {
+                const uint32_t lonohi = l & ~(h + h + carry);
+                const uint32_t hinolo = h & ~(l >> 1);
+                inlen = _tzcnt_u32(hinolo | lonohi);
+                inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
+                in = _mm512_maskz_mov_epi16(inmask, in);
+                adjust = (int)inlen - 31;
+                inlen = 0;
+                goto failiteration;
+            }
+        }
 
-    __m512i mslo =
-        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
+        hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
+        carry = carryout;
 
-    __m512i mshi =
-        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
+        __m512i mslo = _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
 
-    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
-    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
+        __m512i mshi = _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
 
-    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
-    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
-    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
+        const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
+        const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
 
-    taglo =
-        _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
-    taghi =
-        _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
-    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
-    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
+        const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
+        const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
+        const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
 
+        taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
+        taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
+        __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
+        __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
 
-    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
-    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
+        magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
+        magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
 
-    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
-                                     0xea); // A&B|C
-    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
-                                     0xea);
-    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
+        mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
+            0xea); // A&B|C
+        mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
+            0xea);
+        mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
 
-    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
+        mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
 
-    const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
-    const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
-    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
-    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
-    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
-    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
+        const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
+        const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
+        const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
+        const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
+        const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
+        const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
 
-    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
-    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
+        uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
+        uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
 
-    _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
-    _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
-    outbuf += advlo + advhi;
-  }
-  outbuf -= adjust;
+        _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
+        _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
+        outbuf += advlo + advhi;
+    }
+    outbuf -= adjust;
 
 tail:
-  if (inlen != 0) {
-    // We must have inlen < 31.
-    inmask = _cvtu32_mask32((1 << inlen) - 1);
-    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
-    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-    adjust = inlen - 31;
-    inlen = 0;
-    goto lastiteration;
-  }
-  *outlen = (outbuf - outbuf_orig) + adjust;
-  return ((inbuf - inbuf_orig) + adjust);
+    if (inlen != 0) {
+        // We must have inlen < 31.
+        inmask = _cvtu32_mask32((1 << inlen) - 1);
+        in = _mm512_maskz_loadu_epi16(inmask, inbuf);
+        if (big_endian) {
+            in = _mm512_shuffle_epi8(in, byteflip);
+        }
+        adjust = inlen - 31;
+        inlen = 0;
+        goto lastiteration;
+    }
+    *outlen = (outbuf - outbuf_orig) + adjust;
+    return ((inbuf - inbuf_orig) + adjust);
 }
 /* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
 
@@ -18082,141 +20851,138 @@ tail:
 namespace simdutf {
 namespace icelake {
 
-
 simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    const char *buf = input;
+implementation::detect_encodings(const char* input,
+    size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) {
+        const char* buf = input;
+
+        const char* start = buf;
+        const char* end = input + length;
+
+        bool is_utf8 = true;
+        bool is_utf16 = true;
+        bool is_utf32 = true;
+
+        int out = 0;
+
+        avx512_utf8_checker checker {};
+        __m512i currentmax = _mm512_setzero_si512();
+        while (buf + 64 <= end) {
+            __m512i in = _mm512_loadu_si512((__m512i*)buf);
+            __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+            __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+            if (surrogates) {
+                is_utf8 = false;
+
+                // Can still be either UTF-16LE or UTF-32 depending on the positions
+                // of the surrogates. To be valid UTF-32, a surrogate cannot be in the
+                // two most significant bytes of any 32-bit word. On the other hand, to
+                // be valid UTF-16LE, at least one surrogate must be in the two most
+                // significant bytes of a 32-bit word since they always come in pairs in
+                // UTF-16LE. Note that we always proceed in multiples of 4 bytes before
+                // this point so there is no offset in 32-bit words.
+
+                if ((surrogates & 0xaaaaaaaa) != 0) {
+                    is_utf32 = false;
+                    __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
+                        diff, _mm512_set1_epi16(uint16_t(0x0400)));
+                    __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+                    // high must be followed by low
+                    if ((highsurrogates << 1) != lowsurrogates) {
+                        return simdutf::encoding_type::unspecified;
+                    }
 
-    const char *start = buf;
-    const char *end = input + length;
+                    bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+                    if (ends_with_high) {
+                        buf += 31 * sizeof(char16_t); // advance only by 31 words so that we start
+                                                      // with the high surrogate on the next round.
+                    } else {
+                        buf += 32 * sizeof(char16_t);
+                    }
+                    is_utf16 = validate_utf16le(reinterpret_cast<const char16_t*>(buf),
+                        (end - buf) / sizeof(char16_t));
+                    if (!is_utf16) {
+                        return simdutf::encoding_type::unspecified;
 
-    bool is_utf8 = true;
-    bool is_utf16 = true;
-    bool is_utf32 = true;
+                    } else {
+                        return simdutf::encoding_type::UTF16_LE;
+                    }
 
-    int out = 0;
+                } else {
+                    is_utf16 = false;
+                    // Check for UTF-32
+                    if (length % 4 == 0) {
+                        const char32_t* input32 = reinterpret_cast<const char32_t*>(buf);
+                        const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + length / 4;
+                        if (validate_utf32(input32, end32 - input32)) {
+                            return simdutf::encoding_type::UTF32_LE;
+                        }
+                    }
+                    return simdutf::encoding_type::unspecified;
+                }
+                break;
+            }
+            // If no surrogate, validate under other encodings as well
 
-    avx512_utf8_checker checker{};
-    __m512i currentmax = _mm512_setzero_si512();
-    while (buf + 64 <= end) {
-      __m512i in = _mm512_loadu_si512((__m512i *)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if (surrogates) {
-        is_utf8 = false;
-
-        // Can still be either UTF-16LE or UTF-32 depending on the positions
-        // of the surrogates To be valid UTF-32, a surrogate cannot be in the
-        // two most significant bytes of any 32-bit word. On the other hand, to
-        // be valid UTF-16LE, at least one surrogate must be in the two most
-        // significant bytes of a 32-bit word since they always come in pairs in
-        // UTF-16LE. Note that we always proceed in multiple of 4 before this
-        // point so there is no offset in 32-bit words.
-
-        if ((surrogates & 0xaaaaaaaa) != 0) {
-          is_utf32 = false;
-          __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
-              diff, _mm512_set1_epi16(uint16_t(0x0400)));
-          __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-          // high must be followed by low
-          if ((highsurrogates << 1) != lowsurrogates) {
-            return simdutf::encoding_type::unspecified;
-          }
-
-          bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-          if (ends_with_high) {
-            buf +=
-                31 *
-                sizeof(char16_t); // advance only by 31 words so that we start
-                                  // with the high surrogate on the next round.
-          } else {
-            buf += 32 * sizeof(char16_t);
-          }
-          is_utf16 = validate_utf16le(reinterpret_cast<const char16_t *>(buf),
-                                      (end - buf) / sizeof(char16_t));
-          if (!is_utf16) {
-            return simdutf::encoding_type::unspecified;
+            // UTF-32 validation
+            currentmax = _mm512_max_epu32(in, currentmax);
 
-          } else {
-            return simdutf::encoding_type::UTF16_LE;
-          }
+            // UTF-8 validation
+            checker.check_next_input(in);
 
-        } else {
-          is_utf16 = false;
-          // Check for UTF-32
-          if (length % 4 == 0) {
-            const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
-            const char32_t *end32 =
-                reinterpret_cast<const char32_t *>(start) + length / 4;
-            if (validate_utf32(input32, end32 - input32)) {
-              return simdutf::encoding_type::UTF32_LE;
-            }
-          }
-          return simdutf::encoding_type::unspecified;
+            buf += 64;
         }
-        break;
-      }
-      // If no surrogate, validate under other encodings as well
-
-      // UTF-32 validation
-      currentmax = _mm512_max_epu32(in, currentmax);
 
-      // UTF-8 validation
-      checker.check_next_input(in);
-
-      buf += 64;
-    }
+        // Check which encodings are possible
 
-    // Check which encodings are possible
+        if (is_utf8) {
+            size_t current_length = static_cast<size_t>(buf - start);
+            if (current_length != length) {
+                const __m512i utf8 = _mm512_maskz_loadu_epi8(
+                    (1ULL << (length - current_length)) - 1, (const __m512i*)buf);
+                checker.check_next_input(utf8);
+            }
+            checker.check_eof();
+            if (!checker.errors()) {
+                out |= simdutf::encoding_type::UTF8;
+            }
+        }
 
-    if (is_utf8) {
-      size_t current_length = static_cast<size_t>(buf - start);
-      if (current_length != length) {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8(
-            (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
-        checker.check_next_input(utf8);
-      }
-      checker.check_eof();
-      if (!checker.errors()) {
-        out |= simdutf::encoding_type::UTF8;
-      }
-    }
+        if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (length - (buf - start)) / 2)) {
+            out |= simdutf::encoding_type::UTF16_LE;
+        }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(
-                        reinterpret_cast<const char16_t *>(buf),
-                        (length - (buf - start)) / 2)) {
-      out |= simdutf::encoding_type::UTF16_LE;
-    }
+        if (is_utf32 && (length % 4 == 0)) {
+            currentmax = _mm512_max_epu32(
+                _mm512_maskz_loadu_epi8(
+                    (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
+                    (const __m512i*)buf),
+                currentmax);
+            __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
+                _MM_CMPINT_GT);
+            if (outside_range == 0) {
+                out |= simdutf::encoding_type::UTF32_LE;
+            }
+        }
 
-    if (is_utf32 && (length % 4 == 0)) {
-      currentmax = _mm512_max_epu32(
-          _mm512_maskz_loadu_epi8(
-              (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
-              (const __m512i *)buf),
-          currentmax);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range == 0) {
-        out |= simdutf::encoding_type::UTF32_LE;
-      }
+        return out;
+    } else if (implementation::validate_utf8(input, length)) {
+        return simdutf::encoding_type::UTF8;
+    } else {
+        return simdutf::encoding_type::unspecified;
     }
-
-    return out;
-  } else if (implementation::validate_utf8(input, length)) {
-    return simdutf::encoding_type::UTF8;
-  } else {
-    return simdutf::encoding_type::unspecified;
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-    avx512_utf8_checker checker{};
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    avx512_utf8_checker checker {};
     const char* ptr = buf;
     const char* end = ptr + len;
     for (; ptr + 64 <= end; ptr += 64) {
@@ -18224,969 +20990,1163 @@ simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t l
         checker.check_next_input(utf8);
     }
     {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
-       checker.check_next_input(utf8);
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - ptr)) - 1, (const __m512i*)ptr);
+        checker.check_next_input(utf8);
     }
     checker.check_eof();
-    return ! checker.errors();
+    return !checker.errors();
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-    avx512_utf8_checker checker{};
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    avx512_utf8_checker checker {};
     const char* ptr = buf;
     const char* end = ptr + len;
-    size_t count{0};
+    size_t count { 0 };
     for (; ptr + 64 <= end; ptr += 64) {
-      const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-      checker.check_next_input(utf8);
-      if(checker.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
-        res.count += count;
-        return res;
-      }
-      count += 64;
+        const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
+        checker.check_next_input(utf8);
+        if (checker.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
+            res.count += count;
+            return res;
+        }
+        count += 64;
     }
     {
-      const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
-      checker.check_next_input(utf8);
-      if(checker.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
-        res.count += count;
-        return res;
-      } else {
-        return result(error_code::SUCCESS, len);
-      }
-    }
-}
-
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return icelake::validate_ascii(buf, len);
-}
-
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  const char* buf_orig = buf;
-  const char* end = buf + len;
-  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  for (; buf + 64 <= end; buf += 64) {
-    const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
-    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-    if(notascii) {
-      return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
-    }
-  }
-  {
-    const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
-    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-    if(notascii) {
-      return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
-    }
-  }
-  return result(error_code::SUCCESS, len);
-}
-
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-    const char16_t *end = buf + len;
-
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_loadu_si512((__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - ptr)) - 1, (const __m512i*)ptr);
+        checker.check_next_input(utf8);
+        if (checker.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
+            res.count += count;
+            return res;
+        } else {
+            return result(error_code::SUCCESS, len);
+        }
+    }
+}
+
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return icelake::validate_ascii(buf, len);
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    const char* buf_orig = buf;
+    const char* end = buf + len;
+    const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
+    for (; buf + 64 <= end; buf += 64) {
+        const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
+        __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+        if (notascii) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
+        }
+    }
+    {
+        const __m512i input = _mm512_maskz_loadu_epi8((1ULL << (end - buf)) - 1, (const __m512i*)buf);
+        __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+        if (notascii) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
+        }
+    }
+    return result(error_code::SUCCESS, len);
+}
+
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* end = buf + len;
+
+    for (; buf + 32 <= end;) {
+        __m512i in = _mm512_loadu_si512((__m512i*)buf);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
+            buf += 32;
         }
-      } else {
-        buf += 32;
-      }
     }
-    if(buf < end) {
-      __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
+    if (buf < end) {
+        __m512i in = _mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
         }
-      }
     }
     return true;
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-   const char16_t *end = buf + len;
-   const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* end = buf + len;
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    for (; buf + 32 <= end;) {
+        __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
+            buf += 32;
         }
-      } else {
-        buf += 32;
-      }
     }
-    if(buf < end) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
+    if (buf < end) {
+        __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf), byteflip);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
         }
-      }
     }
     return true;
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-    const char16_t *start_buf = buf;
-    const char16_t *end = buf + len;
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_loadu_si512((__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* start_buf = buf;
+    const char16_t* end = buf + len;
+    for (; buf + 32 <= end;) { // full 512-bit blocks: 32 UTF-16 code units per load
+        __m512i in = _mm512_loadu_si512((__m512i*)buf);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 0xD800..0xDFFF becomes 0x0000..0x07FF
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set: unit i is a surrogate
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); // 0xD800..0xDBFF
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates; // remaining surrogates: 0xDC00..0xDFFF
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); // first low not preceded by a high
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); // first high not followed by a low
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
-        }
-      } else {
-        buf += 32;
-      }
-    }
-    if(buf < end) {
-      __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-      }
+            buf += 32;
+        }
+    }
+    if (buf < end) { // tail of 1..31 code units
+        __m512i in = _mm512_maskz_loadu_epi16((uint32_t(1) << (end - buf)) - 1, (__m512i*)buf); // unsigned mask: a 31-unit tail must not overflow a signed int
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+        }
     }
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-    const char16_t *start_buf = buf;
-    const char16_t *end = buf + len;
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* start_buf = buf;
+    const char16_t* end = buf + len;
     const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    for (; buf + 32 <= end;) { // full 512-bit blocks: 32 UTF-16 code units per load
+        __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); // swap the two bytes of every 16-bit unit
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 0xD800..0xDFFF becomes 0x0000..0x07FF
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set: unit i is a surrogate
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); // 0xD800..0xDBFF
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates; // remaining surrogates: 0xDC00..0xDFFF
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); // first low not preceded by a high
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); // first high not followed by a low
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
-        }
-      } else {
-        buf += 32;
-      }
-    }
-    if(buf < end) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-      }
+            buf += 32;
+        }
+    }
+    if (buf < end) { // tail of 1..31 code units
+        __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((uint32_t(1) << (end - buf)) - 1, (__m512i*)buf), byteflip); // unsigned mask: a 31-unit tail must not overflow a signed int
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+        }
     }
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t * tail = icelake::validate_utf32(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
+    const char32_t* tail = icelake::validate_utf32(buf, len);
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
 
     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
     const char32_t* buf_orig = buf;
     while (buf <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range) {
-        return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
-      }
-
-      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-                                _MM_CMPINT_GT);
-      if (surrogate_range) {
-        return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
-      }
-      buf += 16;
-    }
-    if(buf < buf_orig + len) {
-      __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range) {
-        return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
-      }
-      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-                                _MM_CMPINT_GT);
-      if (surrogate_range) {
-        return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
-      }
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
+        __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+            _MM_CMPINT_GT);
+        if (outside_range) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
+        }
+
+        __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+
+        __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+            _MM_CMPINT_GT);
+        if (surrogate_range) {
+            return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
+        }
+        buf += 16;
+    }
+    if (buf < buf_orig + len) {
+        __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1 << (buf_orig + len - buf)) - 1), (const __m512i*)buf);
+        __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+            _MM_CMPINT_GT);
+        if (outside_range) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
+        }
+        __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+
+        __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+            _MM_CMPINT_GT);
+        if (surrogate_range) {
+            return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
+        }
     }
 
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.second == nullptr) {
-    return 0;
-  }
-  return ret.second - utf16_output;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.second == nullptr) {
-    return 0;
-  }
-  return ret.second - utf16_output;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
-  size_t saved_bytes = ret.second - utf16_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.second == nullptr) {
+        return 0;
+    }
+    return ret.second - utf16_output;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.second == nullptr) {
+        return 0;
+    }
+    return ret.second - utf16_output;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
+    size_t saved_bytes = ret.second - utf16_output; // pointer difference: char16_t units written so far, not bytes
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar pass over the unconsumed tail, appending at ret.second
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
+    size_t saved_bytes = ret.second - utf16_output; // pointer difference: char16_t units written so far, not bytes
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar pass over the unconsumed tail, appending at ret.second
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
+    utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+    if (ret.second == nullptr)
+        return 0;
+
+    size_t saved_bytes = ret.second - utf32_output;
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
+            ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
     return saved_bytes;
-  }
-
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
-
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
-  size_t saved_bytes = ret.second - utf16_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32);
+    auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        auto new_buf = std::get<0>(ret);
+        // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
+        // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t*>(std::get<1>(ret)));
+        res.count += (std::get<0>(ret) - buf);
+        return res;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    const char* end = buf + len;
+    if (std::get<0>(ret) == end) {
+        return { simdutf::SUCCESS, saved_bytes };
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
+        std::get<0>(ret) += 1;
+    }
+
+    if (std::get<0>(ret) != end) {
+        auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t*>(utf32_output) + saved_bytes);
+        if (scalar_result.error != simdutf::SUCCESS) {
+            scalar_result.count += (std::get<0>(ret) - buf);
+        } else {
+            scalar_result.count += saved_bytes;
+        }
+        return scalar_result;
+    }
+
+    return { simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output) };
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
+    utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+    size_t saved_bytes = ret.second - utf32_output; // pointer difference: 32-bit units written so far, not bytes
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
+            ret.first, len - (ret.first - buf), utf32_out + saved_bytes); // scalar pass over the unconsumed tail, appending after the vector output
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
     return saved_bytes;
-  }
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+}
 
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    size_t outlen;
+    size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
+    if (inlen != len) {
+        return 0;
+    }
+    return outlen;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    size_t outlen;
+    size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
+    if (inlen != len) {
+        return 0;
+    }
+    return outlen;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    size_t outlen; // filled in by the vectorized pass; used below as an offset into utf8_output
+    size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen); // offset into buf where the vectorized pass stopped
+    if (inlen != len) { // vectorized pass stopped early: let the scalar routine handle the rest and report the error
+        result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - inlen, utf8_output + outlen); // remaining input is len - inlen units; outlen is an output offset, not an input count
+        res.count += inlen; // rebase the scalar count onto the start of buf
+        return res;
+    }
+    return { simdutf::SUCCESS, outlen };
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    size_t outlen; // filled in by the vectorized pass; used below as an offset into utf8_output
+    size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen); // offset into buf where the vectorized pass stopped
+    if (inlen != len) { // vectorized pass stopped early: let the scalar routine handle the rest and report the error
+        result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - inlen, utf8_output + outlen); // remaining input is len - inlen units; outlen is an output offset, not an input count
+        res.count += inlen; // rebase the scalar count onto the start of buf
+        return res;
+    }
+    return { simdutf::SUCCESS, outlen };
+}
 
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output);
+}
 
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
 }
 
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-  if (ret.second == nullptr)
-    return 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+}
 
-  size_t saved_bytes = ret.second - utf32_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
     return saved_bytes;
-  }
-
-  // Note: the AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outside 16-byte window.
-  //       It means, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
-
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
-                                        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32);
-  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    auto new_buf = std::get<0>(ret);
-    // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
-    // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
-    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t *>(std::get<1>(ret)));
-    res.count += (std::get<0>(ret) - buf);
-    return res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  const char* end = buf + len;
-  if (std::get<0>(ret) == end) {
-    return {simdutf::SUCCESS, saved_bytes};
-  }
-
-  // Note: the AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outside 16-byte window.
-  //       It means, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
-      std::get<0>(ret) += 1;
-  }
-
-  if (std::get<0>(ret) != end) {
-    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
-    if (scalar_result.error != simdutf::SUCCESS) {
-      scalar_result.count +=  (std::get<0>(ret) - buf);
-    } else {
-      scalar_result.count += saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
     }
-    return scalar_result;
-  }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
 
-  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output);
 }
 
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-  size_t saved_bytes = ret.second - utf32_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
     return saved_bytes;
-  }
-
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
-
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
-                                        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) { return 0; }
-  return outlen;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) { return 0; }
-  return outlen;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
-    res.count += inlen;
-    return res;
-  }
-  return {simdutf::SUCCESS, outlen};
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
-    res.count += inlen;
-    return res;
-  }
-  return {simdutf::SUCCESS, outlen};
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    scalar_res.count += (std::get<0>(ret) - buf);
-    return scalar_res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_res.error) {
-      scalar_res.count += (std::get<0>(ret) - buf);
-      return scalar_res;
-    } else {
-      scalar_res.count += saved_bytes;
-      return scalar_res;
-    }
-  }
-  return simdutf::result(simdutf::SUCCESS, saved_bytes);
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    scalar_res.count += (std::get<0>(ret) - buf);
-    return scalar_res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_res.error) {
-      scalar_res.count += (std::get<0>(ret) - buf);
-      return scalar_res;
-    } else {
-      scalar_res.count += saved_bytes;
-      return scalar_res;
-    }
-  }
-  return simdutf::result(simdutf::SUCCESS, saved_bytes);
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  size_t pos = 0;
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (pos + 32 <= length) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    _mm512_storeu_si512(output + pos, utf16);
-    pos += 32;
-  }
-  if(pos < length) {
-    __mmask32 m((1<< (length - pos))-1);
-    __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    _mm512_mask_storeu_epi16(output + pos, m, utf16);
-  }
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        scalar_res.count += (std::get<0>(ret) - buf);
+        return scalar_res;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_res.error) {
+            scalar_res.count += (std::get<0>(ret) - buf);
+            return scalar_res;
+        } else {
+            scalar_res.count += saved_bytes;
+            return scalar_res;
+        }
+    }
+    return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        scalar_res.count += (std::get<0>(ret) - buf);
+        return scalar_res;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_res.error) {
+            scalar_res.count += (std::get<0>(ret) - buf);
+            return scalar_res;
+        } else {
+            scalar_res.count += saved_bytes;
+            return scalar_res;
+        }
+    }
+    return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
 
-  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    size_t pos = 0;
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (pos + 32 <= length) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
+        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+        _mm512_storeu_si512(output + pos, utf16);
+        pos += 32;
+    }
+    if (pos < length) {
+        __mmask32 m((1 << (length - pos)) - 1);
+        __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
+        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+        _mm512_mask_storeu_epi16(output + pos, m, utf16);
+    }
+}
+
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  size_t count{0};
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 32;
-    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-    count += count_ones(not_high_surrogate);
-  }
+    size_t count { 0 };
 
-  return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 32;
+        uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
+        count += count_ones(not_high_surrogate);
+    }
+
+    return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-  size_t count{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
-    ptr += 32;
-    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-    count += count_ones(not_high_surrogate);
-  }
+    size_t count { 0 };
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
+        ptr += 32;
+        uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
+        count += count_ones(not_high_surrogate);
+    }
 
-  return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
+    return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
 }
 
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    const char* end = length >= 64 ? input + length - 64 : nullptr;
+    const char* ptr = input;
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  const char* end = length >= 64 ? input + length - 64 : nullptr;
-  const char* ptr = input;
+    const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
 
-  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
+    size_t count { 0 };
 
-  size_t count{0};
+    while (ptr <= end) {
+        __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 64;
+        uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
+        count += 64 - count_ones(continuation_bitmask);
+    }
 
-  while (ptr <= end) {
-    __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 64;
-    uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
-    count += 64 - count_ones(continuation_bitmask);
-  }
+    return count + scalar::utf8::count_code_points(ptr, length - (ptr - input));
+}
 
-  return count + scalar::utf8::count_code_points(ptr, length - (ptr - input));
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
 }
 
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return scalar::utf32::latin1_length_from_utf32(length);
+}
 
-  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  size_t count{0};
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 32;
-    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+    size_t count { 0 };
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 32;
+        __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+        __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
+        __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+        __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
 
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
-  }
+        size_t ascii_count = count_ones(ascii_bitmask);
+        size_t two_bytes_count = count_ones(two_bytes_bitmask);
+        size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+        size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
 
-  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
+        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 2 * surrogate_bytes_count;
+    }
+
+    return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-  size_t count{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    ptr += 32;
-    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+    size_t count { 0 };
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+        ptr += 32;
+        __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+        __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
+        __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+        __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+
+        size_t ascii_count = count_ones(ascii_bitmask);
+        size_t two_bytes_count = count_ones(two_bytes_bitmask);
+        size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+        size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 2 * surrogate_bytes_count;
+    }
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
-  }
+    return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return implementation::count_utf16le(input, length); // forwards to count_utf16le and returns its result unchanged
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return implementation::count_utf16be(input, length); // forwards to count_utf16be and returns its result unchanged
+}
 
-  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return implementation::count_utf16le(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return implementation::count_utf16be(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= length; pos += 64) {
-      __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
-      uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= length; pos += 64) {
+        __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input + pos));
+        uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1));
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
-  const char32_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const char32_t* end = length >= 16 ? input + length - 16 : nullptr; // start of the last full 16-element window; nullptr when length < 16
+    const char32_t* ptr = input;
 
-  const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
-  const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
-  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+    const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f); // threshold compared against to build ascii_bitmask
+    const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff); // threshold compared against to build two_bytes_bitmask
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); // threshold compared against to build three_bytes_bitmask
 
-  size_t count{0};
+    size_t count { 0 };
 
-  while (ptr <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 16;
-    __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
-    __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
-    __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
+    while (ptr <= end) {
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 16;
+        __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f); // lanes with value <= 0x7f
+        __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff); // of the non-ASCII lanes, those with value <= 0x7ff
+        __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff); // of the lanes in neither mask above, those with value <= 0xffff
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t three_bytes_count = count_ones(three_bytes_bitmask);
-    size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
-  }
+        size_t ascii_count = count_ones(ascii_bitmask);
+        size_t two_bytes_count = count_ones(two_bytes_bitmask);
+        size_t three_bytes_count = count_ones(three_bytes_bitmask);
+        size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count; // whatever remains of the 16 lanes
+        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 4 * four_bytes_count; // weight each class by its byte count
+    }
 
-  return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
+    return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input)); // elements not consumed by the loop go through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
-  const char32_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const char32_t* end = length >= 16 ? input + length - 16 : nullptr; // start of the last full 16-element window; nullptr when length < 16
+    const char32_t* ptr = input;
 
-  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); // threshold compared against to build surrogates_bitmask
 
-  size_t count{0};
+    size_t count { 0 };
 
-  while (ptr <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 16;
-    __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+    while (ptr <= end) {
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 16;
+        __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); // lanes with value above 0xffff
 
-    count += 16 + count_ones(surrogates_bitmask);
-  }
+        count += 16 + count_ones(surrogates_bitmask); // one unit per lane, plus one more for each lane above 0xffff
+    }
 
-  return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
+    return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input)); // elements not consumed by the loop go through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return implementation::count_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return implementation::count_utf8(input, length); // forwards to count_utf8 and returns its result unchanged
 }
 
 } // namespace icelake
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/end.h
 /* begin file src/simdutf/icelake/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 // nothing needed.
@@ -19194,7 +22154,6 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -19202,10 +22161,10 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /* end file src/icelake/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/implementation.cpp
 /* begin file src/haswell/implementation.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/begin.h
 /* begin file src/simdutf/haswell/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
 // #define SIMDUTF_IMPLEMENTATION haswell
@@ -19217,7 +22176,7 @@ SIMDUTF_TARGET_HASWELL
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
 namespace simdutf {
@@ -19228,31 +22187,34 @@ namespace {
 #endif
 using namespace simd;
 
-
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
-  return input.reduce_or().is_ascii();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
+    return input.reduce_or().is_ascii();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
 /* begin file src/haswell/avx2_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int avx2_detect_encodings(const char * buf, size_t len) {
+int avx2_detect_encodings(const char* buf, size_t len)
+{
     const char* start = buf;
     const char* end = buf + len;
 
@@ -19267,11 +22229,11 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 
     __m256i currentmax = _mm256_setzero_si256();
 
-    checker check{};
+    checker check {};
 
-    while(buf + 64 <= end) {
+    while (buf + 64 <= end) {
         __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(nextin);
@@ -19297,15 +22259,15 @@ int avx2_detect_encodings(const char * buf, size_t len) {
             if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
                 is_utf32 = false;
                 // Code from avx2_validate_utf16le.cpp
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
+                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint32_t V0 = ~surrogates_bitmask0;
 
-                const auto    vH0 = (in16 & v_fc) == v_dc;
+                const auto vH0 = (in16 & v_fc) == v_dc;
                 const uint32_t H0 = vH0.to_bitmask();
 
                 const uint32_t L0 = ~H0 & surrogates_bitmask0;
@@ -19338,7 +22300,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
                     } else {
                         const uint32_t V = ~surrogates_bitmask;
 
-                        const auto    vH = (in_16 & v_fc) == v_dc;
+                        const auto vH = (in_16 & v_fc) == v_dc;
                         const uint32_t H = vH.to_bitmask();
 
                         const uint32_t L = ~H & surrogates_bitmask;
@@ -19362,8 +22324,8 @@ int avx2_detect_encodings(const char * buf, size_t len) {
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
+                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
 
                     // Must start checking for surrogates
                     __m256i currentoffsetmax = _mm256_setzero_si256();
@@ -19377,14 +22339,14 @@ int avx2_detect_encodings(const char * buf, size_t len) {
                     currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);
 
                     while (input + 8 < end32) {
-                        const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
-                        currentmax = _mm256_max_epu32(in32,currentmax);
+                        const __m256i in32 = _mm256_loadu_si256((__m256i*)input);
+                        currentmax = _mm256_max_epu32(in32, currentmax);
                         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
                         input += 8;
                     }
 
                     __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
+                    if (_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
                         return simdutf::encoding_type::unspecified;
                     }
                 } else {
@@ -19411,7 +22373,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
+            uint8_t block[64] {};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -19422,14 +22384,14 @@ int avx2_detect_encodings(const char * buf, size_t len) {
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -19438,7 +22400,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 }
 /* end file src/haswell/avx2_detect_encodings.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_validate_utf16.cpp
 /* begin file src/haswell/avx2_validate_utf16.cpp */
 /*
     In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
@@ -19485,8 +22447,9 @@ int avx2_detect_encodings(const char * buf, size_t len) {
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-template <endianness big_endian>
-const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const char16_t* avx2_validate_utf16(const char16_t* input, size_t size)
+{
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -19528,19 +22491,19 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
             const uint32_t V = ~surrogates_bitmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint32_t H = vH.to_bitmask();
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint32_t L = ~H & surrogates_bitmask;
 
-            const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint32_t c = V | a | b;     // Combine all the masks into the final one.
+            const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
+            const uint32_t b = a << 1; // Just mark that the opposite fact holds,
+                                       // thanks to that we have only two masks for valid case.
+            const uint32_t c = V | a | b; // Combine all the masks into the final one.
 
             if (c == 0xffffffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -19561,9 +22524,9 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
     return input;
 }
 
-
-template <endianness big_endian>
-const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
+{
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -19606,19 +22569,19 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint32_t V = ~surrogates_bitmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint32_t H = vH.to_bitmask();
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint32_t L = ~H & surrogates_bitmask;
 
-            const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint32_t c = V | a | b;     // Combine all the masks into the final one.
+            const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
+            const uint32_t b = a << 1; // Just mark that the opposite fact holds,
+                                       // thanks to that we have only two masks for valid case.
+            const uint32_t c = V | a | b; // Combine all the masks into the final one.
 
             if (c == 0xffffffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -19639,13 +22602,14 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
     return result(error_code::SUCCESS, input - start);
 }
 /* end file src/haswell/avx2_validate_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp
 /* begin file src/haswell/avx2_validate_utf32le.cpp */
 /* Returns:
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
+const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size)
+{
     const char32_t* end = input + size;
 
     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
@@ -19655,26 +22619,26 @@ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
     __m256i currentoffsetmax = _mm256_setzero_si256();
 
     while (input + 8 < end) {
-        const __m256i in = _mm256_loadu_si256((__m256i *)input);
-        currentmax = _mm256_max_epu32(in,currentmax);
+        const __m256i in = _mm256_loadu_si256((__m256i*)input);
+        currentmax = _mm256_max_epu32(in, currentmax);
         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
         input += 8;
     }
     __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-    if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     return input;
 }
 
-
-const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size)
+{
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -19685,17 +22649,17 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz
     __m256i currentoffsetmax = _mm256_setzero_si256();
 
     while (input + 8 < end) {
-        const __m256i in = _mm256_loadu_si256((__m256i *)input);
-        currentmax = _mm256_max_epu32(in,currentmax);
+        const __m256i in = _mm256_loadu_si256((__m256i*)input);
+        currentmax = _mm256_max_epu32(in, currentmax);
         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
 
         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 0) {
             return result(error_code::SURROGATE, input - start);
         }
         input += 8;
@@ -19705,303 +22669,286 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz
 }
 /* end file src/haswell/avx2_validate_utf32le.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp
 /* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    __m256i ascii = _mm256_cvtepu8_epi16(in);
-    if (big_endian) {
-      const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      ascii = _mm256_shuffle_epi8(ascii, swap256);
-    }
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-    return 12;
-  }
-
-  const uint8_t idx =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    const __m128i composedminus =
-        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-    const __m128i lowtenbits =
-        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-    const __m128i hightenbits = _mm_srli_epi32(composedminus, 10);
-    const __m128i lowtenbitsadd =
-        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-    const __m128i hightenbitsadd =
-        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-    __m128i surrogates =
-        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (big_endian) {
-      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
-      surrogates = _mm_shuffle_epi8(surrogates, swap);
-    }
-    _mm_storeu_si128((__m128i *)basic_buffer, composed);
-    uint32_t surrogate_buffer[4];
-    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
-    for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] < 65536) {
-        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-        utf16_output++;
-      } else {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-        utf16_output += 2;
-      }
-    }
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+template<endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char16_t*& utf16_output)
+{
+    // We use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    // On return, utf16_output points past the code units written and the result is the number of input bytes consumed.
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it may be
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // shuffle control that swaps the two bytes of each 16-bit lane (big-endian output)
+    const __m128i in = _mm_loadu_si128((__m128i*)input); // unaligned 16-byte load, regardless of how many bytes end up consumed
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; // only the low 12 bits index the lookup table
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // Every one of the 16 bytes ends a code point: widen all 16 bytes to 16-bit units at once.
+        __m256i ascii = _mm256_cvtepu8_epi16(in);
+        if (big_endian) {
+            const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            ascii = _mm256_shuffle_epi8(ascii, swap256);
+        }
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf16_output), ascii);
+        utf16_output += 16; // We wrote 16 16-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 8; // We wrote 16 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4;
+        return 12;
+    }
+
+    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; // selects the shuffle mask and the case handled below
+    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; // value returned to the caller for this mask
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 6; // We wrote 12 bytes, 6 code points. The 16-byte store may overflow them by 4 bytes.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4; // We keep 8 bytes of the 16-byte store, so we overflow by 8 bytes.
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        //////////////
+        // There might be garbage inputs where a leading byte masquerades as a four-byte
+        // leading byte (by being followed by 3 continuation bytes), but is smaller than
+        // 0xf0. This could trigger a buffer overflow if we only counted leading
+        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+        // We do so at the cost of an extra mask.
+        /////////////
+        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        // We deliberately carry the leading four bits in highbyte if they are present,
+        // we remove them later when computing hightenbits.
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        // When we need to generate a surrogate pair (leading byte at least 0xF0), then
+        // the corresponding 32-bit value in 'composed' will be greater than
+        // (0xf0000000 >> 6), i.e. > 0x3c00000. This can be used later to identify the
+        // location of the surrogate pairs.
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        const __m128i composedminus = _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+        const __m128i lowtenbits = _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+        // Notice the 0x3ff mask: it drops the leading-byte bits that were carried in highbyte.
+        const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+        const __m128i lowtenbitsadd = _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+        const __m128i hightenbitsadd = _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+        const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+        __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+        uint32_t basic_buffer[4];
+        uint32_t basic_buffer_swap[4]; // only written and read when big_endian is set
+        if (big_endian) {
+            _mm_storeu_si128((__m128i*)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
+            surrogates = _mm_shuffle_epi8(surrogates, swap);
+        }
+        _mm_storeu_si128((__m128i*)basic_buffer, composed);
+        uint32_t surrogate_buffer[4];
+        _mm_storeu_si128((__m128i*)surrogate_buffer, surrogates);
+        for (size_t i = 0; i < 3; i++) { // inspects the first three 32-bit lanes
+            if (basic_buffer[i] > 0x3c00000) { // leading-byte bits present: emit the surrogate pair
+                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+                utf16_output += 2;
+            } else {
+                utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+                utf16_output++;
+            }
+        }
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
 /* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char32_t *&utf32_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
-    utf32_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 3;
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+size_t convert_masked_utf8_to_utf32(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char32_t*& utf32_output)
+{
+    // We use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    // On return, utf32_output points past the code points written and the result is the number of input bytes consumed.
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it may be
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i in = _mm_loadu_si128((__m128i*)input); // unaligned 16-byte load, regardless of how many bytes end up consumed
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff; // only the low 12 bits index the lookup table
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // Every one of the 16 bytes ends a code point: widen them to 32-bit values, 8 bytes per store.
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu8_epi32(in));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+        utf32_output += 16; // We wrote 16 32-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm256_storeu_si256((__m256i*)utf32_output, _mm256_cvtepu16_epi32(composed));
+        utf32_output += 8; // We wrote 32 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back to the table-driven path.
+
+    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; // selects the shuffle mask and the case handled below
+    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; // value returned to the caller for this mask
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm256_storeu_si256((__m256i*)utf32_output, _mm256_cvtepu16_epi32(composed));
+        utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
+        // overflow of 32 - 24 = 8 bytes.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4; // The 16-byte store is fully used: 4 code points of 4 bytes each.
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); // 3 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
 /* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -20052,489 +22999,493 @@ size_t convert_masked_utf8_to_utf32(const char *input,
     - We need two 256-entry tables that have 8704 bytes in total.
 */
 
-
 /*
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
-  const char16_t* end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-    if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-          const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m256i t0 = _mm256_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m256i t2 = _mm256_and_si256(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m256i t3 = _mm256_or_si256(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          const uint32_t M0 = one_byte_bitmask & 0x55555555;
-          const uint32_t M1 = M0 >> 7;
-          const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-          // 4. pack the bytes
-
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-          const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-          const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-          utf8_output += row[0];
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-          utf8_output += row_2[0];
-
-          // 6. adjust pointers
-          buf += 16;
-          continue;
-    }
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                                0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m256i s0 = _mm256_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-        const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-        const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                              (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-        // Due to the wider registers, the following path is less likely to be useful.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-          const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-          utf8_output += 12;
-          buf += 16;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-        const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-        const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-        const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-        const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-        const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-        const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-        const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-        const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-        utf8_output += row2[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-        utf8_output += row3[0];
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+template<endianness big_endian>
+std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output)
+{ // Converts UTF-16 to UTF-8, 16 words per AVX2 iteration; returns {first unprocessed input word, output cursor}, with a null input pointer on an invalid surrogate pair.
+    const char16_t* end = buf + len;
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+    const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+        // a single non-surrogate 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+        if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path: no word has a bit set above bit 6
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // per-word mask: no bits set above the 7th bit (word encodes as one UTF-8 byte)
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // per-word mask: no bits set above the 11th bit (word encodes as one or two UTF-8 bytes)
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+            // expected output   : [110a|aaaa|10bb|bbbb] x 16
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare one 8-bit lookup index per 128-bit lane (low lane in bits 0-7, high lane in bits 16-23)
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes (each store writes 16 bytes; the cursor advances by the row's first entry)
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // 1. Check whether there is any surrogate word in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 4 x 128-bit shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+            // else: at least one surrogate word in the register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15; // stop at index 14 so that buf[k + 1] stays within the 16-word chunk
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // defensive clamp; the loop guard already leaves at least 28 words here
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++; // the second word of the pair is consumed here as well
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf8_output); // not a valid high/low pair: report failure via a null input pointer
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000; // rebuild the code point from the pair
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k; // advance past the words handled by the scalar loop
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, utf8_output);
+    } // while
+    return std::make_pair(buf, utf8_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-    if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-          const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m256i t0 = _mm256_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m256i t2 = _mm256_and_si256(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m256i t3 = _mm256_or_si256(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          const uint32_t M0 = one_byte_bitmask & 0x55555555;
-          const uint32_t M1 = M0 >> 7;
-          const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-          // 4. pack the bytes
-
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-          const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-          const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-          utf8_output += row[0];
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-          utf8_output += row_2[0];
-
-          // 6. adjust pointers
-          buf += 16;
-          continue;
-    }
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                                0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m256i s0 = _mm256_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-        const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-        const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                              (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-        // Due to the wider registers, the following path is less likely to be useful.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-          const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-          utf8_output += 12;
-          buf += 16;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-        const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-        const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-        const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-        const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-        const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-        const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-        const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-        const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-        utf8_output += row2[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-        utf8_output += row3[0];
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+template<endianness big_endian>
+std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+    const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+        if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+            // expected output   : [110a|aaaa|10bb|bbbb] x 16
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // 1. Check whether there are any surrogate words in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
 /* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -20585,760 +23536,793 @@ std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t*
     - We need two 256-entry tables that have 8704 bytes in total.
 */
 
-
 /*
   Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* end = buf + len;
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: we extend all sixteen 16-bit words to sixteen 32-bit words
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
-        utf32_output += 16;
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          // No surrogate pair
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian>
+std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* end = buf + len;
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+    while (buf + 16 <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+
+        // 1. Check whether there are any surrogate words in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: we extend all sixteen 16-bit words to sixteen 32-bit words
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+            utf32_output += 16;
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    // No surrogate pair
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, utf32_output);
+    } // while
+    return std::make_pair(buf, utf32_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: we extend all sixteen 16-bit words to sixteen 32-bit words
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
-        utf32_output += 16;
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          // No surrogate pair
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian>
+std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+    while (buf + 16 <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+
+        // 1. Check whether there are any surrogate words in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: we extend all sixteen 16-bit words to sixteen 32-bit words
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+            utf32_output += 16;
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    // No surrogate pair
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
 /* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
 /* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  __m256i running_max = _mm256_setzero_si256();
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-  if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
-
-  return std::make_pair(buf, utf8_output);
-}
-
-
-std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const char32_t* start = buf;
-
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    // Check for too large input
-    const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-    if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-      // Check for illegal surrogate words
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-      }
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    __m256i running_max = _mm256_setzero_si256();
+    __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+            // expected output   : [110a|aaaa|10bb|bbbb] x 16
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 4 x 128-bit shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    // check for invalid input
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    return std::make_pair(buf, utf8_output);
+}
+
+std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const char32_t* start = buf;
+
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        // Check for too large input
+        const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+        if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+        }
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+            // expected output   : [110a|aaaa|10bb|bbbb] x 16
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+            // Check for illegal surrogate words
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+            }
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 4 x 128-bit shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
 /* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* end = buf + len;
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
+    const char32_t* end = buf + len;
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    __m256i forbidden_bytemask = _mm256_setzero_si256();
 
+    while (buf + 8 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf16_output);
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf16_output);
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
+    // check for invalid input
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf16_output);
+    }
 
-  return std::make_pair(buf, utf16_output);
+    return std::make_pair(buf, utf16_output);
 }
 
+template<endianness big_endian>
+std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
 
-template <endianness big_endian>
-std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    while (buf + 8 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
-
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-      }
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+        if (saturation_bitmask == 0xffffffff) {
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
+            }
+
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output);
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
 /* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace haswell {
@@ -21348,92 +24332,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index(); // current byte offset (idx) into buf
+    simdutf_really_inline bool has_full_block() const; // true while idx < lenminusstep
+    simdutf_really_inline const uint8_t* full_block() const; // pointer to buf[idx]
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There is always a last block of at least 1 byte, unless len == idx (e.g. len == 0), in which
+     * case this function returns 0 without writing to dst. In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance(); // moves idx forward by STEP_SIZE
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf; // start of the input; not owned by the reader
+    const size_t len; // total number of input bytes
+    const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+    size_t idx; // byte offset of the current block
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // function-local static: allocated once, reused by every call, never freed; the malloc result is not null-checked
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) { // reads sizeof(simd8x64<uint8_t>) bytes from text
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // bytes that compare below ' ' as int8_t (control characters, and bytes that become negative) are shown as '_'
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf; // points at the shared static buffer; the next call overwrites it
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // function-local static: allocated once, reused by every call, never freed; the malloc result is not null-checked
+    in.store(reinterpret_cast<uint8_t*>(buf)); // dump the vector's bytes into the text buffer
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') { // comparison is done on plain char, so the treatment of bytes >= 0x80 depends on char signedness
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf; // points at the shared static buffer; the next call overwrites it
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1)); // function-local static: allocated once, reused by every call, never freed; the malloc result is not null-checked
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; // bit i set -> 'X', clear -> ' '; the shift is done in size_t, so this relies on size_t being 64 bits wide
+    }
+    buf[64] = '\0';
+    return buf; // points at the shared static buffer; the next call overwrites it
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE } // reads the member len (declared, hence initialized, before this one); clamped to 0 so the size_t subtraction cannot wrap
+    , idx { 0 } // reading starts at the beginning of buf
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep; // strict '<': a final block of exactly STEP_SIZE bytes is not reported here and is left to get_remainder
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx]; // start of the current block; no bounds check is performed here
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{
+    if (len == idx) {
+        return 0; // nothing left to copy: dst is not written at all
+    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx); // overwrite the leading padding with the remaining input; assumes len - idx <= STEP_SIZE, i.e. the full blocks were consumed first
+    return len - idx; // number of real (non-padding) bytes now in dst
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE; // unconditional step; idx is not clamped against len
 }
 
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace haswell {
@@ -21442,21 +24444,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -21464,101 +24467,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; // shares bit 6 with OVERLONG_4; its byte patterns:
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________ (low nibbles 1000 through 1100)
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________ (followed by the entries for 1110 and 1111)
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______ <lead byte in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) // sc: the special-case error bits; the result XORs bit 0x80 of the must-be-continuation mask into it
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. a 4-byte lead within the last 3 bytes, a 3+ byte lead within the last 2, or any lead as the last byte.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1 // 255 leaves a position unconstrained; only the last three positions carry real thresholds
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); // load the trailing sizeof(simd8<uint8_t>) entries so the three thresholds line up with the vector's last three bytes
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -21569,51 +24563,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // prev1 holds, for each lane, the byte preceding it (the first lane is presumably fed from the
+        // tail of prev_input); special-case bits and the multibyte-length check are OR-ed into error.
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the last block left an incomplete UTF-8 character at its end, no further input can
+        // finish it, so fold that state into the error accumulator.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete; // an all-ASCII block cannot complete a character left unfinished by the previous block
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block); // the first chunk is checked against the last chunk of the previous block
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block); // the first chunk is checked against the last chunk of the previous block
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]); // remember whether this block ends in the middle of a character
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]; // keep the last chunk as the predecessor for the next block
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere(); // any bit set in the accumulated error vector means an error was recorded
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -21622,7 +24619,7 @@ using utf8_validation::utf8_checker;
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace haswell {
@@ -21633,15 +24630,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {}; // value-initialized checker state
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block()); // load 64 bytes straight from the input
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {}; // zero-initialized scratch; get_remainder space-pads and fills it unless no bytes remain
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -21650,97 +24648,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length); // char* overload: forwards to the uint8_t template with utf8_checker as the checker
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {}; // value-initialized checker state
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 }; // number of input bytes already validated by full 64-byte blocks
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // step back one byte into the previous chunk: sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count); // re-validate from there with the scalar routine to locate the error
+            res.count += count; // presumably res.count is relative to input + count; add count to make it relative to input
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {}; // zero-initialized scratch; get_remainder space-pads and fills it unless no bytes remain
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count); // no one-byte step back here, unlike the loop above
+        res.count += count; // presumably res.count is relative to input + count; add count to make it relative to input
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length); // on success the reported count is the full input length
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length); // char* overload: forwards to the uint8_t template with utf8_checker as the checker
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length) // note: the checker template parameter is not used in this body
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {}; // all-zero seed for the running OR
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in; // accumulate every block so a single is_ascii test at the end covers the whole input
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {}; // stays zero or receives 0x20 padding from get_remainder; both are ASCII, so padding cannot cause a failure
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length); // char* overload: forwards to the uint8_t template; utf8_checker is only a placeholder template argument there
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length) // note: the checker template parameter is not used in this body
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 }; // number of input bytes already confirmed ASCII by full 64-byte blocks
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count); // rescan from the start of the offending block with the scalar routine to locate the byte
+            return result(res.error, count + res.count); // add count back so the reported position is relative to input
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {}; // stays zero or receives 0x20 padding from get_remainder; both are ASCII
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count); // the scalar rescan runs on the real input tail, not on the padded copy
+        return result(res.error, count + res.count); // add count back so the reported position is relative to input
+    } else {
+        return result(error_code::SUCCESS, length); // on success the reported count is the full input length
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length); // char* overload: forwards to the uint8_t template; utf8_checker is only a placeholder template argument there
 }
 
 } // namespace utf8_validation
@@ -21749,10 +24756,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
@@ -21760,63 +24766,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // The implementation is not specific to haswell and should be moved to the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation mask is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -21824,32 +24831,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -21857,258 +24863,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // prev1 comes from input.prev<1>(prev_input) -- presumably each byte's predecessor, with the
+        // first lane taken from prev_input (TODO confirm). Any nonzero result below is OR-ed into error.
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is the eighth leading byte counting from the end.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is the eighth leading byte counting from the end.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -22118,68 +25147,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -22187,251 +25214,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // prev1 holds, for each position, the byte just before it; check_special_cases
+        // treats prev1 as byte 1 and input as byte 2, and any error bits end up in this->error.
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -22441,36 +25490,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240);
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -22478,64 +25528,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -22547,466 +25605,667 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace haswell {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) {
+        return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
     } else {
-      return simdutf::encoding_type::unspecified;
+        if (implementation::validate_utf8(input, length)) {
+            return simdutf::encoding_type::UTF8;
+        } else {
+            return simdutf::encoding_type::unspecified;
+        }
     }
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_utf8(buf,len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_ascii(buf,len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
+}
+
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
+    const char32_t* tail = avx2_validate_utf32le(buf, len);
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
+    result res = avx2_validate_utf32le_with_errors(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = avx2_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    char32_t* utf32_output) const noexcept
+{
+    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
     }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
     }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // Run the avx2_ kernel first. ret.first.count is always the position reached in the input buffer (not the number of words written, even if finished); ret.second is the output cursor.
+    std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished: hand the unprocessed tail to the scalar converter
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so add that offset back
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor by the count the scalar pass reported
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words (char32_t) written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // Run the avx2_ kernel first. ret.first.count is always the position reached in the input buffer (not the number of words written, even if finished); ret.second is the output cursor.
+    std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished: hand the unprocessed tail to the scalar converter
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so add that offset back
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor by the count the scalar pass reported
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words (char32_t) written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // Run the avx2_ kernel first. ret.first.count is always the position reached in the input buffer (not the number of words written, even if finished); ret.second is the output cursor.
+    std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) { // Input not fully consumed: hand the remainder to the scalar converter
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so add that offset back
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor by the count the scalar pass reported
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words (char16_t) written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // Run the avx2_ kernel first. ret.first.count is always the position reached in the input buffer (not the number of words written, even if finished); ret.second is the output cursor.
+    std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) { // Input not fully consumed: hand the remainder to the scalar converter
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so add that offset back
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor by the count the scalar pass reported
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words (char16_t) written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m256i v_00000000 = _mm256_setzero_si256();
-  const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
-  const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 8 <= length; pos += 8) {
-    __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
-    const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
-    const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
-    const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-    const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
-    const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
-    const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::count_code_points(input, length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return scalar::utf32::latin1_length_from_utf32(length);
+}
 
-    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-    count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
-  }
-  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m256i v_00000000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 8 <= length; pos += 8) {
-    __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
-    const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
-    size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4;
-    count += 8 + surrogate_count;
-  }
-  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
+    const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 8 <= length; pos += 8) {
+        __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
+        const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
+        const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
+        const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
+        const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+        const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
+        const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
+        const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+
+        size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
+        size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+        size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+        count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+    }
+    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 8 <= length; pos += 8) {
+        __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
+        const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
+        size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4;
+        count += 8 + surrogate_count;
+    }
+    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace haswell
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/end.h
 /* begin file src/simdutf/haswell/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
 // nothing needed.
@@ -23014,7 +26273,6 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -23022,14 +26280,10 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /* end file src/haswell/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=ppc64/implementation.cpp
 /* begin file src/ppc64/implementation.cpp */
 
-
-
-
-
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/begin.h
 /* begin file src/simdutf/ppc64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
 // #define SIMDUTF_IMPLEMENTATION ppc64
@@ -23042,32 +26296,34 @@ namespace {
 #endif
 using namespace simd;
 
-
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
-  // careful: 0x80 is not ascii.
-  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
+    // careful: 0x80 is not ascii.
+    return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace ppc64 {
@@ -23077,92 +26333,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index();
+    simdutf_really_inline bool has_full_block() const;
+    simdutf_really_inline const uint8_t* full_block() const;
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+     * function returns 0 without writing to dst). In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance();
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf;
+    const size_t len;
+    const size_t lenminusstep;
+    size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    in.store(reinterpret_cast<uint8_t*>(buf));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') {
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+    }
+    buf[64] = '\0';
+    return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
+    , idx { 0 }
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{
+    if (len == idx) {
+        return 0;
+    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx);
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE;
 }
 
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace ppc64 {
@@ -23171,21 +26445,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -23193,101 +26468,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -23298,51 +26564,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete;
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -23351,7 +26620,7 @@ using utf8_validation::utf8_checker;
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace ppc64 {
@@ -23362,15 +26631,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -23379,97 +26649,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 };
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            res.count += count;
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length)
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 };
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            return result(res.error, count + res.count);
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 } // namespace utf8_validation
@@ -23478,10 +26757,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace ppc64 {
 namespace {
@@ -23489,63 +26767,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // The implementation is not specific to haswell and should be moved to the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation byte is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -23553,32 +26832,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -23586,258 +26864,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth leading byte counting back from the end.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth leading byte counting back from the end.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of word written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -23847,68 +27148,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -23916,251 +27215,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 to 12 inputs bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 to 12 inputs bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of word written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -24170,36 +27491,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240);
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -24207,64 +27529,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -24279,242 +27609,303 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace ppc64 {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  int out = 0;
-  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
-  if((length % 2) == 0) {
-    if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
-  }
-  if((length % 4) == 0) {
-    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
-  }
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    int out = 0;
+    if (validate_utf8(input, length)) {
+        out |= encoding_type::UTF8;
+    }
+    if ((length % 2) == 0) {
+        if (validate_utf16(reinterpret_cast<const char16_t*>(input), length / 2)) {
+            out |= encoding_type::UTF16_LE;
+        }
+    }
+    if ((length % 4) == 0) {
+        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
+            out |= encoding_type::UTF32_LE;
+        }
+    }
 
-  return out;
+    return out;
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_utf8(buf,len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_ascii(buf,len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::LITTLE>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::BIG>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
+    return scalar::utf32::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate(buf, len);
+simdutf_warn_unused bool implementation::validate_utf32(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf32::validate(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
+{
+    return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  scalar::utf16::change_endianness_utf16(input, length, output);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    scalar::utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf8_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf16_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace ppc64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/end.h
 /* begin file src/simdutf/ppc64/end.h */
 /* end file src/simdutf/ppc64/end.h */
 /* end file src/ppc64/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/implementation.cpp
 /* begin file src/westmere/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/begin.h
 /* begin file src/simdutf/westmere/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
 // #define SIMDUTF_IMPLEMENTATION westmere
@@ -24533,30 +27924,34 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
-  return input.reduce_or().is_ascii();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
+    return input.reduce_or().is_ascii();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_detect_encodings.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_detect_encodings.cpp
 /* begin file src/westmere/sse_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int sse_detect_encodings(const char * buf, size_t len) {
+int sse_detect_encodings(const char* buf, size_t len)
+{
     const char* start = buf;
     const char* end = buf + len;
 
@@ -24571,13 +27966,13 @@ int sse_detect_encodings(const char * buf, size_t len) {
 
     __m128i currentmax = _mm_setzero_si128();
 
-    checker check{};
+    checker check {};
 
-    while(buf + 64 <= end) {
+    while (buf + 64 <= end) {
         __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
-        __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-        __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
+        __m128i secondin = _mm_loadu_si128((__m128i*)buf + 1);
+        __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
+        __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(secondin);
@@ -24611,15 +28006,15 @@ int sse_detect_encodings(const char * buf, size_t len) {
                 is_utf32 = false;
                 // Code from sse_validate_utf16le.cpp
                 // Not efficient, we do not process surrogates_bitmask1
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
+                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);
 
-                const auto    vH0 = (in16 & v_fc) == v_dc;
+                const auto vH0 = (in16 & v_fc) == v_dc;
                 const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());
 
                 const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);
@@ -24655,7 +28050,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
                     } else {
                         const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
-                        const auto    vH = (in_16 & v_fc) == v_dc;
+                        const auto vH = (in_16 & v_fc) == v_dc;
                         const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
                         const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
@@ -24680,8 +28075,8 @@ int sse_detect_encodings(const char * buf, size_t len) {
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
+                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
 
                     // Must start checking for surrogates
                     __m128i currentoffsetmax = _mm_setzero_si128();
@@ -24699,14 +28094,14 @@ int sse_detect_encodings(const char * buf, size_t len) {
                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);
 
                     while (input + 4 < end32) {
-                        const __m128i in32 = _mm_loadu_si128((__m128i *)input);
-                        currentmax = _mm_max_epu32(in32,currentmax);
+                        const __m128i in32 = _mm_loadu_si128((__m128i*)input);
+                        currentmax = _mm_max_epu32(in32, currentmax);
                         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
                         input += 4;
                     }
 
                     __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
+                    if (_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
                         is_utf32 = false;
                     }
                 } else {
@@ -24735,7 +28130,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
+            uint8_t block[64] {};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -24746,14 +28141,14 @@ int sse_detect_encodings(const char * buf, size_t len) {
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const __m128i standardmax = _mm_set1_epi32(0x10ffff);
         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
+        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -24762,7 +28157,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
 }
 /* end file src/westmere/sse_detect_encodings.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_validate_utf16.cpp
 /* begin file src/westmere/sse_validate_utf16.cpp */
 /*
     In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
@@ -24809,8 +28204,9 @@ int sse_detect_encodings(const char * buf, size_t len) {
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-template <endianness big_endian>
-const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const char16_t* sse_validate_utf16(const char16_t* input, size_t size)
+{
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -24851,19 +28247,19 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-            const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
+            const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
+                                                                    // (A low surrogate placed in the 7th register's word
+                                                                    // is an exception we handle.)
+            const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opposite fact holds,
+                                                              // thanks to that we have only two masks for valid case.
+            const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
 
             if (c == 0xffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -24884,9 +28280,9 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
     return input;
 }
 
-
-template <endianness big_endian>
-const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
+{
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -24929,19 +28325,19 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-            const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
+            const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
+                                                                    // (A low surrogate placed in the 7th register's word
+                                                                    // is an exception we handle.)
+            const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opposite fact holds,
+                                                              // thanks to that we have only two masks for valid case.
+            const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
 
             if (c == 0xffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -24962,13 +28358,14 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
     return result(error_code::SUCCESS, input - start);
 }
 /* end file src/westmere/sse_validate_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf32le.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_validate_utf32le.cpp
 /* begin file src/westmere/sse_validate_utf32le.cpp */
 /* Returns:
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
+const char32_t* sse_validate_utf32le(const char32_t* input, size_t size)
+{
     const char32_t* end = input + size;
 
     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
@@ -24978,26 +28375,26 @@ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
     __m128i currentoffsetmax = _mm_setzero_si128();
 
     while (input + 4 < end) {
-        const __m128i in = _mm_loadu_si128((__m128i *)input);
-        currentmax = _mm_max_epu32(in,currentmax);
+        const __m128i in = _mm_loadu_si128((__m128i*)input);
+        currentmax = _mm_max_epu32(in, currentmax);
         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
         input += 4;
     }
     __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-    if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     return input;
 }
 
-
-const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size)
+{
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -25008,17 +28405,17 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size
     __m128i currentoffsetmax = _mm_setzero_si128();
 
     while (input + 4 < end) {
-        const __m128i in = _mm_loadu_si128((__m128i *)input);
-        currentmax = _mm_max_epu32(in,currentmax);
+        const __m128i in = _mm_loadu_si128((__m128i*)input);
+        currentmax = _mm_max_epu32(in, currentmax);
         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
 
         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
             return result(error_code::SURROGATE, input - start);
         }
         input += 4;
@@ -25028,309 +28425,291 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size
 }
 /* end file src/westmere/sse_validate_utf32le.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp
 /* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    __m128i ascii_first = _mm_cvtepu8_epi16(in);
-    __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
-    if (big_endian) {
-      ascii_first = _mm_shuffle_epi8(ascii_first, swap);
-      ascii_second = _mm_shuffle_epi8(ascii_second, swap);
-    }
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    const __m128i composedminus =
-        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-    const __m128i lowtenbits =
-        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-    const __m128i hightenbits = _mm_srli_epi32(composedminus, 10);
-    const __m128i lowtenbitsadd =
-        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-    const __m128i hightenbitsadd =
-        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-    __m128i surrogates =
-        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (big_endian) {
-      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
-      surrogates = _mm_shuffle_epi8(surrogates, swap);
-    }
-    _mm_storeu_si128((__m128i *)basic_buffer, composed);
-    uint32_t surrogate_buffer[4];
-    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
-    for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] < 65536) {
-        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-        utf16_output++;
-      } else {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-        utf16_output += 2;
-      }
-    }
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+template<endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char16_t*& utf16_output)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i in = _mm_loadu_si128((__m128i*)input);
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // We process the data in chunks of 16 bytes.
+        __m128i ascii_first = _mm_cvtepu8_epi16(in);
+        __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
+        if (big_endian) {
+            ascii_first = _mm_shuffle_epi8(ascii_first, swap);
+            ascii_second = _mm_shuffle_epi8(ascii_second, swap);
+        }
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output), ascii_first);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + 8), ascii_second);
+        utf16_output += 16; // We wrote 16 16-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 8; // We wrote 16 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fallback.
+
+    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 6; // We wrote 12 bytes, 6 code points.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4;
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        //////////////
+        // There might be garbage inputs where a leading byte masquerades as a four-byte
+        // leading byte (by being followed by 3 continuation bytes), but is less than
+        // 0xf0. This could trigger a buffer overflow if we only counted leading
+        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+        // We do so at the cost of an extra mask.
+        /////////////
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        // We deliberately carry the leading four bits in highbyte if they are present,
+        // we remove them later when computing hightenbits.
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+        // the corresponding 32-bit value in 'composed' will be greater than
+        // (0xf0000000 >> 6), i.e. > 0x3c00000. This can be used later to identify the
+        // location of the surrogate pairs.
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        const __m128i composedminus = _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+        const __m128i lowtenbits = _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+        // Notice the 0x3ff mask:
+        const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+        const __m128i lowtenbitsadd = _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+        const __m128i hightenbitsadd = _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+        const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+        __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+        uint32_t basic_buffer[4];
+        uint32_t basic_buffer_swap[4];
+        if (big_endian) {
+            _mm_storeu_si128((__m128i*)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
+            surrogates = _mm_shuffle_epi8(surrogates, swap);
+        }
+        _mm_storeu_si128((__m128i*)basic_buffer, composed);
+        uint32_t surrogate_buffer[4];
+        _mm_storeu_si128((__m128i*)surrogate_buffer, surrogates);
+        for (size_t i = 0; i < 3; i++) {
+            if (basic_buffer[i] > 0x3c00000) {
+                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+                utf16_output += 2;
+            } else {
+                utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+                utf16_output++;
+            }
+        }
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
 /* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char32_t *&utf32_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
-    utf32_output += 8; // We wrote 32 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 3;
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+size_t convert_masked_utf8_to_utf32(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char32_t*& utf32_output)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i in = _mm_loadu_si128((__m128i*)input);
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // We process the data in chunks of 16 bytes.
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu8_epi32(in));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 8), _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 12), _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
+        utf32_output += 16; // We wrote 16 32-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(composed));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+        utf32_output += 8; // We wrote 32 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back.
+
+    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(composed));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+        utf32_output += 6; // We advance by 6 code points (24 bytes).
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+    } else if (idx < 209) {
+        // THREE (3) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 3;
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
 /* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -25385,480 +28764,485 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
+template<endianness big_endian>
+std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output)
+{
 
-  const char16_t* end = buf + len;
+    const char16_t* end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-    if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
+    while (buf + 16 + safety_margin <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
         if (big_endian) {
-          const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          nextin = _mm_shuffle_epi8(nextin, swap);
-        }
-        if(!_mm_testz_si128(nextin, v_ff80)) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,in);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-          const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m128i t0 = _mm_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m128i t1 = _mm_and_si128(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m128i t2 = _mm_and_si128(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m128i t3 = _mm_or_si128(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-          const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-          const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-          const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m128i s0 = _mm_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-        const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-        const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                              (one_or_two_bytes_bitmask & 0xaaaa);
-        if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-          const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+        if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+            __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                nextin = _mm_shuffle_epi8(nextin, swap);
+            }
+            if (!_mm_testz_si128(nextin, v_ff80)) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, in);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        // no bits set above 7th bit
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // 1. Check if there are any surrogate words in the input chunk.
+        //    We also have to deal with the situation when there is a surrogate word
+        //    at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if the last word is a surrogate
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(buf, utf8_output);
+    return std::make_pair(buf, utf8_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-    if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
+template<endianness big_endian>
+std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
         if (big_endian) {
-          const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          nextin = _mm_shuffle_epi8(nextin, swap);
-        }
-        if(!_mm_testz_si128(nextin, v_ff80)) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,in);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-          const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m128i t0 = _mm_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m128i t1 = _mm_and_si128(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m128i t2 = _mm_and_si128(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m128i t3 = _mm_or_si128(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-          const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-          const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-          const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m128i s0 = _mm_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-        const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-        const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                              (one_or_two_bytes_bitmask & 0xaaaa);
-        if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-          const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+        if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+            __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                nextin = _mm_shuffle_epi8(nextin, swap);
+            }
+            if (!_mm_testz_si128(nextin, v_ff80)) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, in);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        // no bits set above 7th bit
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // 1. Check if there are any surrogate words in the input chunk.
+        //    We also have to deal with the situation when there is a surrogate
+        //    word at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if the last word is a surrogate
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
 /* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -25913,754 +29297,816 @@ std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* b
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* end = buf + len;
+template<endianness big_endian>
+std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* end = buf + len;
 
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
 
-  while (buf + 16 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    while (buf + 16 <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
 
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: no surrogate pair, extend 16-bit words to 32-bit words
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
-        utf32_output += 8;
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+        if (big_endian) {
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
+
+        // 1. Check if there are any surrogate word in the input chunk.
+        //    We have also deal with situation when there is a surrogate word
+        //    at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if the last word is a surrogate
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: no surrogate pair, extend 16-bit words to 32-bit words
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(in));
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+            utf32_output += 8;
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, utf32_output);
+    } // while
+    return std::make_pair(buf, utf32_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
+template<endianness big_endian>
+std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
 
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    while (buf + 16 <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
 
-  while (buf + 16 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
+        if (big_endian) {
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
 
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: no surrogate pair, extend 16-bit words to 32-bit words
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
-        utf32_output += 8;
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+        // 1. Check if there are any surrogate word in the input chunk.
+        //    We have also deal with situation when there is a surrogate word
+        //    at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if the last word is a surrogate
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: no surrogate pair, extend 16-bit words to 32-bit words
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(in));
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+            utf32_output += 8;
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
 /* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
 /* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
-
-  const char32_t* end = buf + len;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
-  __m128i running_max = _mm_setzero_si128();
-  __m128i forbidden_bytemask = _mm_setzero_si128();
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-    running_max = _mm_max_epu32(_mm_max_epu32(in, running_max), nextin);
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
-
-    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-    // Check for ASCII fast path
-    if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
-      running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);
-      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
-      if(!_mm_testz_si128(nextin_16, v_ff80)) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        // Proceed with next input
-        in_16 = nextin_16;
-      } else {
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m128i t0 = _mm_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m128i t1 = _mm_and_si128(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 = _mm_and_si128(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 = _mm_or_si128(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-      // 4. pack the bytes
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
-    }
-
-
-    // Check for overflow in packing
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
-
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                            (one_or_two_bytes_bitmask & 0xaaaa);
-      if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
-
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-
-      buf += 8;
-    } else {
-      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xFFFF0000 )==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-  if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
-
-  return std::make_pair(buf, utf8_output);
-}
-
-
-std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-
-  const char32_t* end = buf + len;
-  const char32_t* start = buf;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
-  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-
-    // Check for too large input
-    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
-    if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
-
-    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-    // Check for ASCII fast path
-    if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
-      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
-      if(!_mm_testz_si128(nextin_16, v_ff80)) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        // Proceed with next input
-        in_16 = nextin_16;
-        __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
-        if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
-          return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-        }
-      } else {
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m128i t0 = _mm_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m128i t1 = _mm_and_si128(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 = _mm_and_si128(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 = _mm_or_si128(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-      // 4. pack the bytes
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
-    }
-
-
-    // Check for overflow in packing
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-      // Check for illegal surrogate words
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-      }
-
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                            (one_or_two_bytes_bitmask & 0xaaaa);
-      if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
-
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-
-      buf += 8;
-    } else {
-      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xFFFF0000 )==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
+{ // Transcodes UTF-32 code units from buf[0..len) into UTF-8 bytes at utf8_output using 128-bit SSE registers.
+    const char32_t* end = buf + len;
+    // Returns {first unconverted input position, end of written output}; the input pointer is nullptr when a surrogate or a value above 0x10FFFF was detected.
+    const __m128i v_0000 = _mm_setzero_si128(); // __m128i = 128-bit integer register
+    const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 0000
+    const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 0000
+    const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 0000
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
+    const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
+    __m128i running_max = _mm_setzero_si128(); // per-lane unsigned max of every 32-bit word loaded by the SIMD code; compared with 0x10FFFF after the loop
+    __m128i forbidden_bytemask = _mm_setzero_si128(); // accumulates 16-bit lanes found in the surrogate range 0xD800..0xDFFF; checked after the loop
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    // The loop only runs while at least 16 + safety_margin code units remain, so the tail of the input is left unconverted (presumably finished by the caller -- confirm against call sites).
+    while (buf + 16 + safety_margin <= end) { // one iteration may read up to 16 char32_t (4 registers x 16 bytes = 64 bytes) starting at buf
+        // We load two 16-byte registers for a total of 32 bytes, i.e. 8 characters (char32_t).
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1); // in and nextin together hold 8 UTF-32 code units
+        running_max = _mm_max_epu32(
+            _mm_max_epu32(in, running_max), // element-wise unsigned max of in and running_max
+            nextin); // then element-wise unsigned max with nextin
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m128i in_16 = _mm_packus_epi32(
+            _mm_and_si128(in, v_7fffffff),
+            _mm_and_si128(nextin, v_7fffffff)); // packs the two registers of 32-bit lanes into one register of eight 16-bit lanes
+        // Clearing the top bit (& v_7fffffff) makes every lane non-negative, which matters because _mm_packus_epi32 interprets its inputs as signed 32-bit integers; masked lanes above 0xFFFF saturate to 0xFFFF.
+        // The mask does not validate anything: out-of-range inputs are caught through running_max, which is computed from the unmasked registers and checked after the loop.
+
+        // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+        // Check for ASCII fast path
+
+        // ASCII fast path!!!!
+        // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+        // The intuition is that we try to collect 16 ASCII characters which requires
+        // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
+        // as our new inputs.
+        if (_mm_testz_si128(in_16, v_ff80)) { // true when none of the 8 packed words has a bit set above bit 6, i.e. all are ASCII
+            __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
+            __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
+            running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin); // fold the two extra registers into the running max
+            __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); // pack the next 8 code units the same way
+            if (!_mm_testz_si128(nextin_16, v_ff80)) { // the second group of 8 is not entirely ASCII:
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16); // narrow 16-bit lanes to bytes; both 8-byte halves of the result hold the same 8 ASCII bytes
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); // writes 16 bytes, of which only the first 8 are kept (see the pointer adjustment below)
+                // 3. adjust pointers
+                buf += 8; // consume the 8 ASCII code units (8 x 32 bits = 256 bits of input)
+                utf8_output += 8; // 8 ASCII code units produce 8 output bytes
+                // Proceed with next input
+                in_16 = nextin_16;
+                // We need to update in and nextin because they are used later.
+                in = thirdin;
+                nextin = fourthin;
+            } else {
+                // 1. pack the bytes
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        // no bits set above 7th bit -- find out all the ASCII characters
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // compares 16-bit lanes (two bytes at a time):
+            _mm_and_si128(in_16, v_ff80), // keeps only the upper 9 bits of each 16-bit lane
+            v_0000 // against zero
+        ); // the upper 9 bits are all zero for ASCII, i.e. a packed ASCII word has the form 0000 0000 0XXX XXXX
+        // _mm_cmpeq_epi16 yields 1111 1111 1111 1111 for each 16-bit lane that compared equal and 0000 0000 0000 0000 otherwise
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask)); // collect the MSB of every byte of the previous vector into a uint16_t mask (two identical bits per 16-bit lane)
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift each 16-bit lane left by two
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00); // potential first utf8 byte
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in_16, v_003f); // potential second utf8 byte
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // Check for overflow in packing
+
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
+
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+                1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
         } else {
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            // case: at least one 32-bit word would produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15; // the scalar loop converts at most 15 words per pass
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // defensive clamp; the loop condition leaves at least 20 code units here, so it does not fire
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // below 0x80: one byte
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // below 0x800: two bytes
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // below 0x10000: three bytes
+                    if (word >= 0xD800 && word <= 0xDFFF) { // surrogates are rejected
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 0x10000 and above: four bytes
+                    if (word > 0x10FFFF) { // values above 0x10FFFF are rejected
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
+    } // while
+
+    // check for invalid input: fail if any 32-bit word folded into running_max exceeds 0x10FFFF
+    const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+    if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
+        return std::make_pair(nullptr, utf8_output);
     }
-  } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+    if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { // a surrogate (0xD800..0xDFFF) was seen in the 1/2/3-byte SIMD branch
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    return std::make_pair(buf, utf8_output);
 }
-/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
-/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
 
-  const char32_t* end = buf + len;
+std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
-  __m128i forbidden_bytemask = _mm_setzero_si128();
+    const char32_t* end = buf + len;
+    const char32_t* start = buf;
 
-  while (buf + 8 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+    const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
+    const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+    const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
+    const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
 
-    // Check if no bits set above 16th
-    if (saturation_bitmask == 0xffff) {
-      // Pack UTF-32 to UTF-16
-      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
+    while (buf + 16 + safety_margin <= end) {
+        // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
 
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
+        // Check for too large input
+        __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
+        if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
+            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+        }
 
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-        } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
+
+        // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+        // Check for ASCII fast path
+        if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
+            // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+            // The intuition is that we try to collect 16 ASCII characters which requires
+            // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
+            // as our new inputs.
+            __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
+            __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
+            __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
+            if (!_mm_testz_si128(nextin_16, v_ff80)) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                // Proceed with next input
+                in_16 = nextin_16;
+                __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
+                if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
+                    return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+                }
+                // We need to update in and nextin because they are used later.
+                in = thirdin;
+                nextin = fourthin;
+            } else {
+                // 1. pack the bytes
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
         }
-      }
-      buf += k;
-    }
-  }
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
+        // no bits set above 7th bit
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // Check for overflow in packing
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
 
-  return std::make_pair(buf, utf16_output);
+        if (saturation_bitmask == 0xffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+            // Check for illegal surrogate words
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+            const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+            }
+
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+                1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
+                2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+        } else {
+            // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
+/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
+/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
 
+    const char32_t* end = buf + len;
 
-template <endianness big_endian>
-std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+    __m128i forbidden_bytemask = _mm_setzero_si128();
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+    while (buf + 8 <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+        // Check if no bits set above 16th
+        if (saturation_bitmask == 0xffff) {
+            // Pack UTF-32 to UTF-16
+            __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+            const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
+
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
 
-  while (buf + 8 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
+        } else {
+            size_t forward = 7;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf16_output);
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf16_output);
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
+        }
+    }
 
-    // Check if no bits set above 16th
-    if (saturation_bitmask == 0xffff) {
-      // Pack UTF-32 to UTF-16
-      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+    // check for invalid input
+    if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf16_output);
+    }
 
-      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-      }
+    return std::make_pair(buf, utf16_output);
+}
 
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
+template<endianness big_endian>
+std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
+{
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
 
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+
+    while (buf + 8 <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+        // Check if no bits set above 16th
+        if (saturation_bitmask == 0xffff) {
+            // Pack UTF-32 to UTF-16
+            __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+            const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+            const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
+            }
+
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output);
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
 /* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
 
@@ -26668,7 +30114,7 @@ std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32
 } // namespace westmere
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace westmere {
@@ -26678,92 +30124,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index();
+    simdutf_really_inline bool has_full_block() const;
+    simdutf_really_inline const uint8_t* full_block() const;
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+     * function returns 0 without writing to dst). In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance();
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf;
+    const size_t len;
+    const size_t lenminusstep;
+    size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    in.store(reinterpret_cast<uint8_t*>(buf));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') {
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+    }
+    buf[64] = '\0';
+    return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
+    , idx { 0 }
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{
+    if (len == idx) {
+        return 0;
+    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx);
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE;
 }
 
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace westmere {
@@ -26772,21 +30236,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -26794,101 +30259,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -26899,51 +30355,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete;
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -26952,7 +30411,7 @@ using utf8_validation::utf8_checker;
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace westmere {
@@ -26963,15 +30422,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -26980,97 +30440,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 };
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            res.count += count;
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length)
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 };
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            return result(res.error, count + res.count);
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 } // namespace utf8_validation
@@ -27079,10 +30548,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace westmere {
 namespace {
@@ -27090,63 +30558,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // The implementation is not specific to haswell and should be moved to the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation mask is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 to 12 inputs bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -27154,32 +30623,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -27187,258 +30655,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then we have that margin-1 is the eighth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then we have that margin-1 is the eighth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -27448,68 +30939,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -27517,251 +31006,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input has at least four leading bytes, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input has at least four leading bytes, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -27771,36 +31282,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240);
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -27808,64 +31320,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -27880,467 +31400,667 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace westmere {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) {
+        return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
     } else {
-      return simdutf::encoding_type::unspecified;
+        if (implementation::validate_utf8(input, length)) {
+            return simdutf::encoding_type::UTF8;
+        } else {
+            return simdutf::encoding_type::unspecified;
+        }
     }
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_utf8(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    return westmere::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_ascii(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return westmere::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    return westmere::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = sse_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
+    const char32_t* tail = sse_validate_utf32le(buf, len);
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
+    result res = sse_validate_utf32le_with_errors(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter; // built fresh on every call; nothing is carried over between calls
+    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output); // the transcoder's `result` is returned unchanged
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter; // built fresh on every call; nothing is carried over between calls
+    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output); // the transcoder's `result` is returned unchanged
+}
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output); // calls the free function convert_valid directly; no validating_transcoder is built on this path
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output); // calls the free function convert_valid directly; no validating_transcoder is built on this path
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter; // built fresh on every call; nothing is carried over between calls
+    return converter.convert(buf, len, utf32_output); // no endianness template argument on the UTF-32 path
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter; // built fresh on every call; nothing is carried over between calls
+    return converter.convert_with_errors(buf, len, utf32_output); // the transcoder's `result` is returned unchanged
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    char32_t* utf32_output) const noexcept
+{
+    return utf8_to_utf32::convert_valid(input, size, utf32_output); // calls the free function convert_valid directly; no validating_transcoder is built on this path
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output); // forwards straight to the scalar:: routine; this body has no SSE code
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output); // forwards straight to the scalar:: routine; this body has no SSE code
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output); // forwards straight to the scalar:: routine and returns its `result` unchanged
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output); // forwards straight to the scalar:: routine and returns its `result` unchanged
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output); // forwards straight to the scalar:: convert_valid routine; this body has no SSE code
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output); // forwards straight to the scalar:: convert_valid routine; this body has no SSE code
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf16_to_utf8)
     }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    size_t saved_bytes = ret.second - utf8_output; // amount written by the SSE pass (difference of char* output pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf16_to_utf8)
+    }
+    size_t saved_bytes = ret.second - utf8_output; // amount written by the SSE pass (difference of char* output pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first; // Can return directly since scalar fallback already found correct ret.first.count
+    }
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first; // Can return directly since scalar fallback already found correct ret.first.count
+    }
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output); // no dedicated "valid input" path: reuses convert_utf16le_to_utf8
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output); // no dedicated "valid input" path: reuses convert_utf16be_to_utf8
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output); // forwards straight to the scalar:: routine; this body has no SSE code
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); // forwards straight to the scalar:: routine and returns its `result` unchanged
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); // forwards straight to the scalar:: convert_valid routine; this body has no SSE code
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf32_to_utf8)
     }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    size_t saved_bytes = ret.second - utf8_output; // amount written by the SSE pass (difference of char* output pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the utf16 *_with_errors variants in this file; confirm intended
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf16_to_utf32)
+    }
+    size_t saved_bytes = ret.second - utf32_output; // despite the name this counts char32_t units, not bytes (difference of char32_t* pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf16_to_utf32)
+    }
+    size_t saved_bytes = ret.second - utf32_output; // despite the name this counts char32_t units, not bytes (difference of char32_t* pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first; // Can return directly since scalar fallback already found correct ret.first.count
+    }
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words (char32_t units) written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first; // Can return directly since scalar fallback already found correct ret.first.count
+    }
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words (char32_t units) written
+    return ret.first;
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output); // no dedicated "valid input" path: reuses convert_utf32_to_utf8
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf32_to_utf16)
+    }
+    size_t saved_bytes = ret.second - utf16_output; // despite the name this counts char16_t units, not bytes (difference of char16_t* pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output); // SSE pass; yields {input cursor, output cursor}
+    if (ret.first == nullptr) {
+        return 0; // null input cursor from the SSE pass: report 0 (presumably invalid input; confirm in sse_convert_utf32_to_utf16)
+    }
+    size_t saved_bytes = ret.second - utf16_output; // despite the name this counts char16_t units, not bytes (difference of char16_t* pointers)
+    if (ret.first != buf + len) { // SSE pass stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second); // scalar routine resumes at both cursors, over the remaining input length
+        if (scalar_saved_bytes == 0) {
+            return 0; // scalar tail reported 0, so the whole call reports 0
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the utf16 *_with_errors variants in this file; confirm intended
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words (char16_t units) written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position reached in the input buffer, not the number of words written, even when the SSE pass finished
+    std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the utf16 *_with_errors variants in this file; confirm intended
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar ran on buf + ret.first.count, so add that offset back to make count relative to buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // no error: advance the output cursor by scalar_res.count
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words (char16_t units) written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output); // no dedicated "valid input" path: reuses convert_utf32_to_utf16le
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16be(buf, len, utf16_output); // no dedicated "valid input" path: reuses convert_utf32_to_utf16be
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m128i v_00000000 = _mm_setzero_si128();
-  const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
-  const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
-    const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
-    const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
-    const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-    const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-    const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
-    const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
-    const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::BIG>(input, length);
+}
 
-    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
-  }
-  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m128i v_00000000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
-    const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-    const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
-    size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4;
-    count += 4 + surrogate_count;
-  }
-  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return scalar::utf32::latin1_length_from_utf32(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const __m128i v_00000000 = _mm_setzero_si128();
+    const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
+    const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) {
+        __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
+        const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
+        const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
+        const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
+        const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
+        const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+        const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
+        const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
+        const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
+
+        size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
+        size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+        size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+        count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+    }
+    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const __m128i v_00000000 = _mm_setzero_si128();
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) {
+        __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
+        const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
+        const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
+        size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4;
+        count += 4 + surrogate_count;
+    }
+    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace westmere
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/end.h
 /* begin file src/simdutf/westmere/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
 // nothing needed.
diff --git a/src/bun.js/bindings/simdutf.h b/src/bun.js/bindings/simdutf.h
index 0a57a69f7..7fb388e9e 100644
--- a/src/bun.js/bindings/simdutf.h
+++ b/src/bun.js/bindings/simdutf.h
@@ -1,11 +1,11 @@
-/* auto-generated on 2023-02-10 14:42:58 -0500. Do not edit! */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf.h
+/* auto-generated on 2023-06-21 08:09:45 -0400. Do not edit! */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf.h
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
 #define SIMDUTF_H
 #include <cstring>
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/compiler_check.h
 /* begin file include/simdutf/compiler_check.h */
 #ifndef SIMDUTF_COMPILER_CHECK_H
 #define SIMDUTF_COMPILER_CHECK_H
@@ -43,13 +43,13 @@
 
 #endif // SIMDUTF_COMPILER_CHECK_H
 /* end file include/simdutf/compiler_check.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/common_defs.h
 /* begin file include/simdutf/common_defs.h */
 #ifndef SIMDUTF_COMMON_DEFS_H
 #define SIMDUTF_COMMON_DEFS_H
 
 #include <cassert>
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/portability.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/portability.h
 /* begin file include/simdutf/portability.h */
 #ifndef SIMDUTF_PORTABILITY_H
 #define SIMDUTF_PORTABILITY_H
@@ -144,6 +144,8 @@
 // POWER processors. Please see https://github.com/lemire/simdutf/issues/51
 #elif defined(__s390__)
 // s390 IBM system. Big endian.
+#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
+// RISC-V 64-bit
 #else
 // The simdutf library is designed
 // for 64-bit processors and it seems that you are not
@@ -278,7 +280,7 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif // SIMDUTF_PORTABILITY_H
 /* end file include/simdutf/portability.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/avx512.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/avx512.h
 /* begin file include/simdutf/avx512.h */
 #ifndef SIMDUTF_AVX512_H_
 #define SIMDUTF_AVX512_H_
@@ -458,19 +460,21 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif // MSC_VER
 
-#if defined(SIMDUTF_VISUAL_STUDIO)
-    /**
-     * It does not matter here whether you are using
-     * the regular visual studio or clang under visual
-     * studio.
-     */
-    #if SIMDUTF_USING_LIBRARY
-    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+#ifndef SIMDUTF_DLLIMPORTEXPORT
+    #if defined(SIMDUTF_VISUAL_STUDIO)
+      /**
+       * It does not matter here whether you are using
+       * the regular visual studio or clang under visual
+       * studio.
+       */
+      #if SIMDUTF_USING_LIBRARY
+      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+      #else
+      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+      #endif
     #else
-    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+      #define SIMDUTF_DLLIMPORTEXPORT
     #endif
-#else
-    #define SIMDUTF_DLLIMPORTEXPORT
 #endif
 
 /// If EXPR is an error, returns it.
@@ -479,7 +483,7 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif // SIMDUTF_COMMON_DEFS_H
 /* end file include/simdutf/common_defs.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/encoding_types.h
 /* begin file include/simdutf/encoding_types.h */
 #include <string>
 
@@ -491,6 +495,7 @@ enum encoding_type {
         UTF16_BE = 4,   // BOM 0xfe 0xff
         UTF32_LE = 8,   // BOM 0xff 0xfe 0x00 0x00
         UTF32_BE = 16,   // BOM 0x00 0x00 0xfe 0xff
+        Latin1 = 32,
 
         unspecified = 0
 };
@@ -527,7 +532,7 @@ size_t bom_byte_size(encoding_type bom);
 } // BOM namespace
 } // simdutf namespace
 /* end file include/simdutf/encoding_types.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/error.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/error.h
 /* begin file include/simdutf/error.h */
 #ifndef ERROR_H
 #define ERROR_H
@@ -541,9 +546,10 @@ enum error_code {
   TOO_LONG,     // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
   OVERLONG,     // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
                 // and U+FFFF for four-byte characters.
-  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF OR less than or equal than U+7F for ASCII.
+  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF, less than or equal to U+7F for ASCII, OR less than or equal to U+FF for Latin1.
   SURROGATE,    // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
-                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16)
+                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
+                // there must be no surrogate at all (Latin1)
   OTHER         // Not related to validation/transcoding.
 };
 
@@ -564,7 +570,7 @@ SIMDUTF_PUSH_DISABLE_WARNINGS
 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 
 // Public API
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/simdutf_version.h
 /* begin file include/simdutf/simdutf_version.h */
 // /include/simdutf/simdutf_version.h automatically generated by release.py,
 // do not change by hand
@@ -572,7 +578,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "3.2.0"
+#define SIMDUTF_VERSION "3.2.14"
 
 namespace simdutf {
 enum {
@@ -587,13 +593,13 @@ enum {
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */
-  SIMDUTF_VERSION_REVISION = 0
+  SIMDUTF_VERSION_REVISION = 14
 };
 } // namespace simdutf
 
 #endif // SIMDUTF_SIMDUTF_VERSION_H
 /* end file include/simdutf/simdutf_version.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/implementation.h
 /* begin file include/simdutf/implementation.h */
 #ifndef SIMDUTF_IMPLEMENTATION_H
 #define SIMDUTF_IMPLEMENTATION_H
@@ -603,7 +609,7 @@ enum {
 #endif
 #include <vector>
 #include <tuple>
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/internal/isadetection.h
 /* begin file include/simdutf/internal/isadetection.h */
 /* From
 https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
@@ -690,22 +696,12 @@ static inline uint32_t detect_supported_architectures() {
   return instruction_set::ALTIVEC;
 }
 
-#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
-
-#if defined(__ARM_NEON)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 
 static inline uint32_t detect_supported_architectures() {
   return instruction_set::NEON;
 }
 
-#else // ARM without NEON
-
-static inline uint32_t detect_supported_architectures() {
-  return instruction_set::DEFAULT;
-}
-
-#endif
-
 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
 
 
@@ -716,6 +712,7 @@ namespace cpuid_bit {
     // EAX = 0x01
     constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit  1 of ECX for EAX=0x1
     constexpr uint32_t sse42 = uint32_t(1) << 20;    ///< @private bit 20 of ECX for EAX=0x1
+    constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
 
     // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
     // See: "Table 3-8. Information Returned by CPUID Instruction"
@@ -741,6 +738,10 @@ namespace cpuid_bit {
     namespace edx {
       constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
     }
+    namespace xcr0_bit {
+     constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
+     constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
+   }
   }
 }
 
@@ -750,7 +751,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                          uint32_t *edx) {
 #if defined(_MSC_VER)
   int cpu_info[4];
-  __cpuid(cpu_info, *eax);
+  __cpuidex(cpu_info, *eax, *ecx);
   *eax = cpu_info[0];
   *ebx = cpu_info[1];
   *ecx = cpu_info[2];
@@ -768,6 +769,16 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
 #endif
 }
 
+static inline uint64_t xgetbv() {
+ #if defined(_MSC_VER)
+   return _xgetbv(0);
+ #else
+   uint32_t xcr0_lo, xcr0_hi;
+   asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
+   return xcr0_lo | ((uint64_t)xcr0_hi << 32);
+ #endif
+ }
+
 static inline uint32_t detect_supported_architectures() {
   uint32_t eax;
   uint32_t ebx = 0;
@@ -787,6 +798,16 @@ static inline uint32_t detect_supported_architectures() {
     host_isa |= instruction_set::PCLMULQDQ;
   }
 
+  if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
+    return host_isa;
+  }
+
+  // xgetbv for checking if the OS saves registers
+  uint64_t xcr0 = xgetbv();
+
+  if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
+    return host_isa;
+  }
   // ECX for EAX=0x7
   eax = 0x7;
   ecx = 0x0; // Sub-leaf = 0
@@ -800,6 +821,9 @@ static inline uint32_t detect_supported_architectures() {
   if (ebx & cpuid_bit::ebx::bmi2) {
     host_isa |= instruction_set::BMI2;
   }
+  if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
+    return host_isa;
+  }
   if (ebx & cpuid_bit::ebx::avx512f) {
     host_isa |= instruction_set::AVX512F;
   }
@@ -822,7 +846,7 @@ static inline uint32_t detect_supported_architectures() {
 }
 #else // fallback
 
-
+// includes 32-bit ARM.
 static inline uint32_t detect_supported_architectures() {
   return instruction_set::DEFAULT;
 }
@@ -870,7 +894,6 @@ simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * i
   return detect_encodings(reinterpret_cast<const char *>(input), length);
 }
 
-
 /**
  * Validate the UTF-8 string. This function may be best when you expect
  * the input to be almost always valid. Otherwise, consider using
@@ -1034,6 +1057,68 @@ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcep
  */
 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
 
+  /**
+   * Convert Latin1 string into UTF8 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf8_output   the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
+
+
+    /**
+   * Convert Latin1 string into UTF-16LE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF-16BE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF-32 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
+
+ /**
+   * Convert possibly broken UTF-8 string into latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
+
 /**
  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string.
  *
@@ -1073,6 +1158,20 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t le
  */
 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
+
+  /**
+   * Convert possibly broken UTF-8 string into latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept;
+
 /**
  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16
  * string and stop on error.
@@ -1139,6 +1238,21 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng
  */
 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
 
+    /**
+   * Convert valid UTF-8 string into latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
+
+
 /**
  * Using native endianness; Convert valid UTF-8 string into UTF-16 string.
  *
@@ -1187,6 +1301,29 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, siz
  */
 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
 
+
+ /**
+   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the Latin1 string as UTF-8
+   */
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept;
+
+  /**
+   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the UTF-8 string as Latin1
+   */
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept;
+
 /**
  * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
  *
@@ -1230,6 +1367,38 @@ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t len
  */
 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
 /**
  * Convert possibly broken UTF-16LE string into UTF-8 string.
  *
@@ -1260,6 +1429,35 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_
  */
 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
 /**
  * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error.
  *
@@ -1319,6 +1517,36 @@ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *
  */
 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
+
+  /**
+   * Convert valid UTF-16LE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert valid UTF-16BE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
 /**
  * Convert valid UTF-16LE string into UTF-8 string.
  *
@@ -1480,6 +1708,21 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input
  */
 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
+
+  /**
+   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as Latin1
+   */
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
+
+
 /**
  * Using native endianness; Compute the number of bytes that this UTF-16
  * string would require in UTF-8 format.
@@ -1588,6 +1831,53 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t
  */
 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert valid UTF-32 string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
 /**
  * Convert possibly broken UTF-32 string into UTF-16BE string.
  *
@@ -2021,6 +2311,96 @@ public:
   simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
 
   /**
+   * Convert Latin1 string into UTF8 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf8_output   the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0;
+
+
+    /**
+   * Convert Latin1 string into UTF-16LE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1  string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-16BE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-32 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+ /**
+   * Convert possibly broken UTF-8 string into latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+    /**
+   * Convert valid UTF-8 string into latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+
+  /**
    * Convert possibly broken UTF-8 string into UTF-16LE string.
    *
    * During the conversion also validation of the input string is done.
@@ -2159,6 +2539,92 @@ public:
   simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
 
   /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16LE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16BE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
    * Convert possibly broken UTF-16LE string into UTF-8 string.
    *
    * During the conversion also validation of the input string is done.
@@ -2361,6 +2827,52 @@ public:
   simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
 
   /**
+   * Convert possibly broken UTF-32 string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+
+  simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+
+  simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-32 string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
    * Convert possibly broken UTF-32 string into UTF-8 string.
    *
    * During the conversion also validation of the input string is done.
@@ -2404,6 +2916,17 @@ public:
    */
   simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
 
+
+    /**
+   * Return the number of two-byte words that this Latin1 string would require in UTF-16 format.
+   *
+   * Only the length is needed; the string itself is not read.
+   *
+   * @param length        the length of the Latin1 string in bytes
+   * @return the number of two-byte words (char16_t) required to encode the Latin1 string as UTF-16
+   */
+    simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0;
+
   /**
    * Convert possibly broken UTF-32 string into UTF-16LE string.
    *
@@ -2506,6 +3029,15 @@ public:
    */
   virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
 
+ /**
+   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string bytes
+   * @return the number of bytes required to encode the Latin1 string as UTF-8
+   */
+    simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0;
+
   /**
    * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
    *
@@ -2518,6 +3050,41 @@ public:
   simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
 
   /**
+   * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @return the number of bytes required to encode the UTF-32 string as Latin1
+   */
+    simdutf_warn_unused virtual size_t latin1_length_from_utf32( size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in byte
+   * @return the number of bytes required to encode the UTF-8 string as Latin1
+   */
+    simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as Latin1
+   */
+  simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0;
+
+  /**
    * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
    *
    * This function does not validate the input.
@@ -2528,6 +3095,18 @@ public:
    */
   simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
 
+
+    /**
+   * Return the number of four-byte words that this Latin1 string would require in UTF-32 format.
+   *
+   * This function does not validate the input; only the length is needed,
+   * so the string itself is not read.
+   *
+   * @param length        the length of the Latin1 string in bytes
+   * @return the number of four-byte words (char32_t) required to encode the Latin1 string as UTF-32
+   */
+    simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0;
+
   /*
    * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format.
    *
diff --git a/src/bun.js/bindings/sqlite/JSSQLStatement.cpp b/src/bun.js/bindings/sqlite/JSSQLStatement.cpp
index a6855fd19..61ac91ba7 100644
--- a/src/bun.js/bindings/sqlite/JSSQLStatement.cpp
+++ b/src/bun.js/bindings/sqlite/JSSQLStatement.cpp
@@ -107,6 +107,50 @@ static JSC_DECLARE_HOST_FUNCTION(jsSQLStatementDeserialize);
         return JSValue::encode(jsUndefined());                                                                     \
     }
 
+// A sqlite3 connection handle paired with an atomic counter that starts at zero.
+class VersionSqlite3 {
+public:
+    explicit VersionSqlite3(sqlite3* db)
+        : db(db)
+    {
+    }
+    sqlite3* db;
+    std::atomic<uint64_t> version { 0 };
+};
+
+// File-local registry of opened connections; a JS-side database handle is an index into `databases`.
+struct SQLiteSingleton {
+    Vector<VersionSqlite3*> databases;
+    Vector<std::atomic<uint64_t>> schema_versions;
+};
+
+static SQLiteSingleton* _instance = nullptr;
+
+// Shared database list; the singleton is created (capacity 4) on first call, without locking.
+static Vector<VersionSqlite3*>& databases()
+{
+    if (_instance)
+        return _instance->databases;
+
+    // A freshly constructed SQLiteSingleton already holds empty vectors.
+    _instance = new SQLiteSingleton();
+    _instance->databases.reserveInitialCapacity(4);
+    return _instance->databases;
+}
+
+// Close every open connection and null its handle so it cannot be closed or used again.
+extern "C" void Bun__closeAllSQLiteDatabasesForTermination()
+{
+    if (!_instance)
+        return;
+    for (auto* entry : _instance->databases) {
+        if (!entry->db)
+            continue;
+        sqlite3_close_v2(entry->db);
+        entry->db = nullptr;
+    }
+}
+
 namespace WebCore {
 using namespace JSC;
 
@@ -272,10 +316,6 @@ void JSSQLStatement::destroy(JSC::JSCell* cell)
 
 void JSSQLStatementConstructor::destroy(JSC::JSCell* cell)
 {
-    JSSQLStatementConstructor* thisObject = static_cast<JSSQLStatementConstructor*>(cell);
-    for (auto version_db : thisObject->databases) {
-        delete version_db;
-    }
 }
 
 static inline bool rebindValue(JSC::JSGlobalObject* lexicalGlobalObject, sqlite3_stmt* stmt, int i, JSC::JSValue value, JSC::ThrowScope& scope, bool clone)
@@ -547,8 +587,8 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementDeserialize, (JSC::JSGlobalObject * lexic
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    auto count = thisObject->databases.size();
-    thisObject->databases.append(new VersionSqlite3(db));
+    auto count = databases().size();
+    databases().append(new VersionSqlite3(db));
     RELEASE_AND_RETURN(scope, JSValue::encode(jsNumber(count)));
 }
 
@@ -565,12 +605,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementSerialize, (JSC::JSGlobalObject * lexical
     }
 
     int32_t dbIndex = callFrame->argument(0).toInt32(lexicalGlobalObject);
-    if (UNLIKELY(dbIndex < 0 || dbIndex >= thisObject->databases.size())) {
+    if (UNLIKELY(dbIndex < 0 || dbIndex >= databases().size())) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    sqlite3* db = thisObject->databases[dbIndex]->db;
+    sqlite3* db = databases()[dbIndex]->db;
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Can't do this on a closed database"_s));
         return JSValue::encode(JSC::jsUndefined());
@@ -606,7 +646,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementLoadExtensionFunction, (JSC::JSGlobalObje
     }
 
     int32_t dbIndex = callFrame->argument(0).toInt32(lexicalGlobalObject);
-    if (UNLIKELY(dbIndex < 0 || dbIndex >= thisObject->databases.size())) {
+    if (UNLIKELY(dbIndex < 0 || dbIndex >= databases().size())) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
@@ -620,7 +660,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementLoadExtensionFunction, (JSC::JSGlobalObje
     auto extensionString = extension.toWTFString(lexicalGlobalObject);
     RETURN_IF_EXCEPTION(scope, {});
 
-    sqlite3* db = thisObject->databases[dbIndex]->db;
+    sqlite3* db = databases()[dbIndex]->db;
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Can't do this on a closed database"_s));
         return JSValue::encode(JSC::jsUndefined());
@@ -661,11 +701,11 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementExecuteFunction, (JSC::JSGlobalObject * l
     }
 
     int32_t handle = callFrame->argument(0).toInt32(lexicalGlobalObject);
-    if (thisObject->databases.size() < handle) {
+    if (handle < 0 || handle >= databases().size()) { // handle == size() is one past the last entry
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
-    sqlite3* db = thisObject->databases[handle]->db;
+    sqlite3* db = databases()[handle]->db;
 
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Database has closed"_s));
@@ -724,7 +764,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementExecuteFunction, (JSC::JSGlobalObject * l
 
     rc = sqlite3_step(statement);
     if (!sqlite3_stmt_readonly(statement)) {
-        thisObject->databases[handle]->version++;
+        databases()[handle]->version++;
     }
 
     while (rc == SQLITE_ROW) {
@@ -765,12 +805,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementIsInTransactionFunction, (JSC::JSGlobalOb
 
     int32_t handle = dbNumber.toInt32(lexicalGlobalObject);
 
-    if (handle < 0 || handle > thisObject->databases.size()) {
+    if (handle < 0 || handle >= databases().size()) { // handle == size() is one past the last entry
         throwException(lexicalGlobalObject, scope, createRangeError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    sqlite3* db = thisObject->databases[handle]->db;
+    sqlite3* db = databases()[handle]->db;
 
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Database has closed"_s));
@@ -803,12 +843,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementPrepareStatementFunction, (JSC::JSGlobalO
     }
 
     int32_t handle = dbNumber.toInt32(lexicalGlobalObject);
-    if (handle < 0 || handle > thisObject->databases.size()) {
+    if (handle < 0 || handle >= databases().size()) { // handle == size() is one past the last entry
         throwException(lexicalGlobalObject, scope, createRangeError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    sqlite3* db = thisObject->databases[handle]->db;
+    sqlite3* db = databases()[handle]->db;
     if (!db) {
         throwException(lexicalGlobalObject, scope, createRangeError(lexicalGlobalObject, "Cannot use a closed database"_s));
         return JSValue::encode(JSC::jsUndefined());
@@ -848,7 +888,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementPrepareStatementFunction, (JSC::JSGlobalO
     auto* structure = JSSQLStatement::createStructure(vm, lexicalGlobalObject, lexicalGlobalObject->objectPrototype());
     // auto* structure = JSSQLStatement::createStructure(vm, globalObject(), thisObject->getDirect(vm, vm.propertyNames->prototype));
     JSSQLStatement* sqlStatement = JSSQLStatement::create(
-        structure, reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject), statement, thisObject->databases[handle]);
+        structure, reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject), statement, databases()[handle]);
     if (bindings.isObject()) {
         auto* castedThis = sqlStatement;
         DO_REBIND(bindings)
@@ -924,8 +964,8 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementOpenStatementFunction, (JSC::JSGlobalObje
     status = sqlite3_db_config(db, SQLITE_DBCONFIG_DEFENSIVE, 1, NULL);
     assert(status == SQLITE_OK);
 
-    auto count = constructor->databases.size();
-    constructor->databases.append(new VersionSqlite3(db));
+    auto count = databases().size();
+    databases().append(new VersionSqlite3(db));
     RELEASE_AND_RETURN(scope, JSValue::encode(jsNumber(count)));
 }
 
@@ -956,12 +996,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementCloseStatementFunction, (JSC::JSGlobalObj
 
     int dbIndex = dbNumber.toInt32(lexicalGlobalObject);
 
-    if (dbIndex < 0 || dbIndex >= constructor->databases.size()) {
+    if (dbIndex < 0 || dbIndex >= databases().size()) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(jsUndefined());
     }
 
-    sqlite3* db = constructor->databases[dbIndex]->db;
+    sqlite3* db = databases()[dbIndex]->db;
     // no-op if already closed
     if (!db) {
         return JSValue::encode(jsUndefined());
@@ -973,7 +1013,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementCloseStatementFunction, (JSC::JSGlobalObj
         return JSValue::encode(jsUndefined());
     }
 
-    constructor->databases[dbIndex]->db = nullptr;
+    databases()[dbIndex]->db = nullptr;
     return JSValue::encode(jsUndefined());
 }
 
diff --git a/src/bun.js/bindings/sqlite/JSSQLStatement.h b/src/bun.js/bindings/sqlite/JSSQLStatement.h
index e63b99fbb..8566fcdd9 100644
--- a/src/bun.js/bindings/sqlite/JSSQLStatement.h
+++ b/src/bun.js/bindings/sqlite/JSSQLStatement.h
@@ -47,17 +47,6 @@
 
 namespace WebCore {
 
-class VersionSqlite3 {
-public:
-    explicit VersionSqlite3(sqlite3* db)
-        : db(db)
-        , version(0)
-    {
-    }
-    sqlite3* db;
-    std::atomic<uint64_t> version;
-};
-
 class JSSQLStatementConstructor final : public JSC::JSFunction {
 public:
     using Base = JSC::JSFunction;
@@ -82,13 +71,9 @@ public:
         return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
     }
 
-    Vector<VersionSqlite3*> databases;
-    Vector<std::atomic<uint64_t>> schema_versions;
-
 private:
     JSSQLStatementConstructor(JSC::VM& vm, NativeExecutable* native, JSGlobalObject* globalObject, JSC::Structure* structure)
         : Base(vm, native, globalObject, structure)
-        , databases()
     {
     }
 
diff --git a/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h b/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h
index 3997c1d88..82a2c6a24 100644
--- a/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h
+++ b/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h
@@ -29,6 +29,7 @@ public:
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForReadableState;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForPendingVirtualModuleResult;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForCallSite;
+    std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForImportMeta;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForNapiExternal;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForRequireResolveFunction;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForBundlerPlugin;
@@ -37,6 +38,7 @@ public:
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForJSMockImplementation;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForJSMockFunction;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForMockWithImplementationCleanupData;
+    std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForProcessObject;
 
 #include "ZigGeneratedClasses+DOMClientIsoSubspaces.h"
     /* --- bun --- */
diff --git a/src/bun.js/bindings/webcore/DOMIsoSubspaces.h b/src/bun.js/bindings/webcore/DOMIsoSubspaces.h
index 4feca1754..f1b290d25 100644
--- a/src/bun.js/bindings/webcore/DOMIsoSubspaces.h
+++ b/src/bun.js/bindings/webcore/DOMIsoSubspaces.h
@@ -30,6 +30,7 @@ public:
     std::unique_ptr<IsoSubspace> m_subspaceForPendingVirtualModuleResult;
     std::unique_ptr<IsoSubspace> m_subspaceForCallSite;
     std::unique_ptr<IsoSubspace> m_subspaceForNapiExternal;
+    std::unique_ptr<IsoSubspace> m_subspaceForImportMeta;
     std::unique_ptr<IsoSubspace> m_subspaceForRequireResolveFunction;
     std::unique_ptr<IsoSubspace> m_subspaceForBundlerPlugin;
     std::unique_ptr<IsoSubspace> m_subspaceForNodeVMScript;
@@ -37,6 +38,7 @@ public:
     std::unique_ptr<IsoSubspace> m_subspaceForJSMockImplementation;
     std::unique_ptr<IsoSubspace> m_subspaceForJSMockFunction;
     std::unique_ptr<IsoSubspace> m_subspaceForMockWithImplementationCleanupData;
+    std::unique_ptr<IsoSubspace> m_subspaceForProcessObject;
 
 #include "ZigGeneratedClasses+DOMIsoSubspaces.h"
     /*-- BUN --*/
diff --git a/src/bun.js/bindings/webcore/EventEmitter.cpp b/src/bun.js/bindings/webcore/EventEmitter.cpp
index 0650d624c..0e273042b 100644
--- a/src/bun.js/bindings/webcore/EventEmitter.cpp
+++ b/src/bun.js/bindings/webcore/EventEmitter.cpp
@@ -35,6 +35,8 @@ bool EventEmitter::addListener(const Identifier& eventType, Ref<EventListener>&&
     }
 
     eventListenersDidChange();
+    if (this->onDidChangeListener)
+        this->onDidChangeListener(*this, eventType, true);
     return true;
 }
 
@@ -62,6 +64,9 @@ bool EventEmitter::removeListener(const Identifier& eventType, EventListener& li
 
     if (data->eventListenerMap.remove(eventType, listener)) {
         eventListenersDidChange();
+
+        if (this->onDidChangeListener)
+            this->onDidChangeListener(*this, eventType, false);
         return true;
     }
     return false;
@@ -93,6 +98,8 @@ bool EventEmitter::removeAllListeners(const Identifier& eventType)
 
     if (data->eventListenerMap.removeAll(eventType)) {
         eventListenersDidChange();
+        if (this->onDidChangeListener)
+            this->onDidChangeListener(*this, eventType, false);
         return true;
     }
     return false;
diff --git a/src/bun.js/bindings/webcore/EventEmitter.h b/src/bun.js/bindings/webcore/EventEmitter.h
index b46bcff5d..8db59c188 100644
--- a/src/bun.js/bindings/webcore/EventEmitter.h
+++ b/src/bun.js/bindings/webcore/EventEmitter.h
@@ -67,6 +67,8 @@ public:
     bool hasActiveEventListeners(const Identifier& eventType) const;
     bool hasEventListeners(JSC::VM& vm, ASCIILiteral eventType) const;
 
+    WTF::Function<void(EventEmitter&, const Identifier& eventName, bool isAdded)> onDidChangeListener = WTF::Function<void(EventEmitter&, const Identifier& eventName, bool isAdded)>(nullptr);
+
     unsigned getMaxListeners() const { return m_maxListeners; };
 
     void setMaxListeners(unsigned count);
@@ -101,7 +103,9 @@ private:
     EventEmitterData* eventTargetData() { return &m_eventTargetData; }
     EventEmitterData* eventTargetDataConcurrently() { return &m_eventTargetData; }
     EventEmitterData& ensureEventEmitterData() { return m_eventTargetData; }
-    void eventListenersDidChange() {}
+    void eventListenersDidChange()
+    {
+    }
 
     void innerInvokeEventListeners(const Identifier&, SimpleEventListenerVector, const MarkedArgumentBuffer& arguments);
     void invalidateEventListenerRegions();
diff --git a/src/bun.js/bindings/webcore/JSCloseEvent.cpp b/src/bun.js/bindings/webcore/JSCloseEvent.cpp
index be07cbcfe..ad7b6ed57 100644
--- a/src/bun.js/bindings/webcore/JSCloseEvent.cpp
+++ b/src/bun.js/bindings/webcore/JSCloseEvent.cpp
@@ -99,7 +99,7 @@ template<> CloseEvent::Init convertDictionary<CloseEvent::Init>(JSGlobalObject&
     if (isNullOrUndefined)
         codeValue = jsUndefined();
     else {
-        codeValue = object->get(&lexicalGlobalObject, Identifier::fromString(vm, "code"_s));
+        codeValue = object->get(&lexicalGlobalObject, WebCore::builtinNames(vm).codePublicName());
         RETURN_IF_EXCEPTION(throwScope, {});
     }
     if (!codeValue.isUndefined()) {
diff --git a/src/bun.js/bindings/webcore/JSEventEmitter.cpp b/src/bun.js/bindings/webcore/JSEventEmitter.cpp
index 995d845cf..959cbd8d7 100644
--- a/src/bun.js/bindings/webcore/JSEventEmitter.cpp
+++ b/src/bun.js/bindings/webcore/JSEventEmitter.cpp
@@ -148,8 +148,8 @@ static const HashTableValue JSEventEmitterPrototypeTableValues[] = {
     { "addListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_addListener, 2 } },
     { "on"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_addListener, 2 } },
     { "once"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_addOnceListener, 2 } },
-    { "prepend"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependListener, 2 } },
-    { "prependOnce"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependOnceListener, 2 } },
+    { "prependListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependListener, 2 } },
+    { "prependOnceListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependOnceListener, 2 } },
     { "removeListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_removeListener, 2 } },
     { "off"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_removeListener, 2 } },
     { "removeAllListeners"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_removeAllListeners, 1 } },
@@ -219,7 +219,7 @@ JSC_DEFINE_CUSTOM_GETTER(jsEventEmitterConstructor, (JSGlobalObject * lexicalGlo
     return JSValue::encode(JSEventEmitter::getConstructor(JSC::getVM(lexicalGlobalObject), prototype->globalObject()));
 }
 
-static inline JSC::EncodedJSValue addListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis, bool once, bool prepend)
+inline JSC::EncodedJSValue JSEventEmitter::addListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis, bool once, bool prepend)
 {
     auto& vm = JSC::getVM(lexicalGlobalObject);
     auto throwScope = DECLARE_THROW_SCOPE(vm);
@@ -251,7 +251,7 @@ static inline JSC::EncodedJSValue addListener(JSC::JSGlobalObject* lexicalGlobal
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_addListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, false, false);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, false, false);
 }
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_setMaxListenersBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
@@ -280,17 +280,17 @@ static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_getMaxListener
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_addOnceListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, true, false);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, true, false);
 }
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_prependListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, false, true);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, false, true);
 }
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_prependOnceListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, true, true);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, true, true);
 }
 
 JSC_DEFINE_HOST_FUNCTION(jsEventEmitterPrototypeFunction_addListener, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
@@ -325,6 +325,11 @@ JSC_DEFINE_HOST_FUNCTION(jsEventEmitterPrototypeFunction_prependOnceListener, (J
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_removeListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
+    return JSEventEmitter::removeListener(lexicalGlobalObject, callFrame, castedThis);
+}
+
+inline JSC::EncodedJSValue JSEventEmitter::removeListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis)
+{
     auto& vm = JSC::getVM(lexicalGlobalObject);
     auto throwScope = DECLARE_THROW_SCOPE(vm);
     JSC::JSValue actualThis = callFrame->thisValue();
diff --git a/src/bun.js/bindings/webcore/JSEventEmitter.h b/src/bun.js/bindings/webcore/JSEventEmitter.h
index 855241011..30d62d792 100644
--- a/src/bun.js/bindings/webcore/JSEventEmitter.h
+++ b/src/bun.js/bindings/webcore/JSEventEmitter.h
@@ -27,6 +27,9 @@ public:
     static EventEmitter* toWrapped(JSC::VM&, JSC::JSValue);
     static void destroy(JSC::JSCell*);
 
+    static inline JSC::EncodedJSValue addListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis, bool once, bool prepend);
+    static inline JSC::EncodedJSValue removeListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis);
+
     DECLARE_INFO;
 
     static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
diff --git a/src/bun.js/bindings/webcore/WebSocket.cpp b/src/bun.js/bindings/webcore/WebSocket.cpp
index a346175df..1d6392f44 100644
--- a/src/bun.js/bindings/webcore/WebSocket.cpp
+++ b/src/bun.js/bindings/webcore/WebSocket.cpp
@@ -458,8 +458,8 @@ ExceptionOr<void> WebSocket::send(const String& message)
         return {};
     }
 
-    if (message.length() > 0)
-        this->sendWebSocketString(message);
+    // 0-length is allowed
+    this->sendWebSocketString(message);
 
     return {};
 }
@@ -477,8 +477,8 @@ ExceptionOr<void> WebSocket::send(ArrayBuffer& binaryData)
     }
     char* data = static_cast<char*>(binaryData.data());
     size_t length = binaryData.byteLength();
-    if (length > 0)
-        this->sendWebSocketData(data, length);
+    // 0-length is allowed
+    this->sendWebSocketData(data, length);
     return {};
 }
 
@@ -498,8 +498,8 @@ ExceptionOr<void> WebSocket::send(ArrayBufferView& arrayBufferView)
     auto buffer = arrayBufferView.unsharedBuffer().get();
     char* baseAddress = reinterpret_cast<char*>(buffer->data()) + arrayBufferView.byteOffset();
     size_t length = arrayBufferView.byteLength();
-    if (length > 0)
-        this->sendWebSocketData(baseAddress, length);
+    // 0-length is allowed
+    this->sendWebSocketData(baseAddress, length);
 
     return {};
 }
@@ -1232,14 +1232,19 @@ extern "C" void WebSocket__didCloseWithErrorCode(WebCore::WebSocket* webSocket,
 
 extern "C" void WebSocket__didReceiveText(WebCore::WebSocket* webSocket, bool clone, const ZigString* str)
 {
-    WTF::String wtf_str = Zig::toString(*str);
-    if (clone) {
-        wtf_str = wtf_str.isolatedCopy();
-    }
-
+    WTF::String wtf_str = clone ? Zig::toStringCopy(*str) : Zig::toString(*str);
     webSocket->didReceiveMessage(WTFMove(wtf_str));
 }
 extern "C" void WebSocket__didReceiveBytes(WebCore::WebSocket* webSocket, uint8_t* bytes, size_t len)
 {
     webSocket->didReceiveBinaryData({ bytes, len });
 }
+
+extern "C" void WebSocket__incrementPendingActivity(WebCore::WebSocket* webSocket)
+{
+    webSocket->incPendingActivityCount();
+}
+extern "C" void WebSocket__decrementPendingActivity(WebCore::WebSocket* webSocket)
+{
+    webSocket->decPendingActivityCount();
+}
+\ No newline at end of file
diff --git a/src/bun.js/bindings/webcore/WebSocket.h b/src/bun.js/bindings/webcore/WebSocket.h
index 42261cfc4..846bd186b 100644
--- a/src/bun.js/bindings/webcore/WebSocket.h
+++ b/src/bun.js/bindings/webcore/WebSocket.h
@@ -111,6 +111,20 @@ public:
         return m_hasPendingActivity.load();
     }
 
+    void incPendingActivityCount()
+    {
+        m_pendingActivityCount++;
+        ref();
+        updateHasPendingActivity();
+    }
+
+    void decPendingActivityCount()
+    {
+        m_pendingActivityCount--;
+        deref();
+        updateHasPendingActivity();
+    }
+
 private:
     typedef union AnyWebSocket {
         WebSocketClient* client;
@@ -147,20 +161,6 @@ private:
     void sendWebSocketString(const String& message);
     void sendWebSocketData(const char* data, size_t length);
 
-    void incPendingActivityCount()
-    {
-        m_pendingActivityCount++;
-        ref();
-        updateHasPendingActivity();
-    }
-
-    void decPendingActivityCount()
-    {
-        m_pendingActivityCount--;
-        deref();
-        updateHasPendingActivity();
-    }
-
     void failAsynchronously();
 
     enum class BinaryType { Blob,
diff --git a/src/bun.js/event_loop.zig b/src/bun.js/event_loop.zig
index 8441bd064..a3ccd16ad 100644
--- a/src/bun.js/event_loop.zig
+++ b/src/bun.js/event_loop.zig
@@ -224,6 +224,7 @@ pub const CppTask = opaque {
 const ThreadSafeFunction = JSC.napi.ThreadSafeFunction;
 const MicrotaskForDefaultGlobalObject = JSC.MicrotaskForDefaultGlobalObject;
 const HotReloadTask = JSC.HotReloader.HotReloadTask;
+const FSWatchTask = JSC.Node.FSWatcher.FSWatchTask;
 const PollPendingModulesTask = JSC.ModuleLoader.AsyncModule.Queue;
 // const PromiseTask = JSInternalPromise.Completion.PromiseTask;
 const GetAddrInfoRequestTask = JSC.DNS.GetAddrInfoRequest.Task;
@@ -242,6 +243,7 @@ pub const Task = TaggedPointerUnion(.{
     HotReloadTask,
     PollPendingModulesTask,
     GetAddrInfoRequestTask,
+    FSWatchTask,
     // PromiseTask,
     // TimeoutTasklet,
 });
@@ -467,6 +469,11 @@ pub const EventLoop = struct {
                     // special case: we return
                     return 0;
                 },
+                .FSWatchTask => {
+                    var transform_task: *FSWatchTask = task.get(FSWatchTask).?;
+                    transform_task.*.run();
+                    transform_task.deinit();
+                },
                 @field(Task.Tag, typeBaseName(@typeName(AnyTask))) => {
                     var any: *AnyTask = task.get(AnyTask).?;
                     any.run();
@@ -666,6 +673,30 @@ pub const EventLoop = struct {
         }
     }
 
+    /// Ticks the event loop until `promise` is no longer pending or more than
+    /// `timeout` milliseconds have elapsed. Returns true when the promise is
+    /// settled and false on timeout; a `timeout` of 0 never waits.
+    pub fn waitForPromiseWithTimeout(this: *EventLoop, promise: JSC.AnyPromise, timeout: u32) bool {
+        if (promise.status(this.global.vm()) != .Pending) return true;
+        if (timeout == 0) return false;
+
+        const start_time = std.time.milliTimestamp();
+        while (promise.status(this.global.vm()) == .Pending) {
+            this.tick();
+
+            // Check the promise before the deadline: if it settled during this
+            // tick the wait succeeded, even when the tick overran the timeout.
+            if (promise.status(this.global.vm()) != .Pending) break;
+
+            if (std.time.milliTimestamp() - start_time > timeout) {
+                return false;
+            }
+
+            this.autoTick();
+        }
+        return true;
+    }
+
     pub fn waitForTasks(this: *EventLoop) void {
         this.tick();
         while (this.tasks.count > 0) {
diff --git a/src/bun.js/javascript.zig b/src/bun.js/javascript.zig
index d458b6e7e..7d2435823 100644
--- a/src/bun.js/javascript.zig
+++ b/src/bun.js/javascript.zig
@@ -196,7 +196,7 @@ pub const SavedSourceMap = struct {
     pub const SourceMapHandler = js_printer.SourceMapHandler.For(SavedSourceMap, onSourceMapChunk);
 
     pub fn putMappings(this: *SavedSourceMap, source: logger.Source, mappings: MutableString) !void {
-        var entry = try this.map.getOrPut(std.hash.Wyhash.hash(0, source.path.text));
+        var entry = try this.map.getOrPut(bun.hash(source.path.text));
         if (entry.found_existing) {
             var value = Value.from(entry.value_ptr.*);
             if (value.get(MappingList)) |source_map_| {
@@ -213,7 +213,7 @@ pub const SavedSourceMap = struct {
     }
 
     pub fn get(this: *SavedSourceMap, path: string) ?MappingList {
-        var mapping = this.map.getEntry(std.hash.Wyhash.hash(0, path)) orelse return null;
+        var mapping = this.map.getEntry(bun.hash(path)) orelse return null;
         switch (Value.from(mapping.value_ptr.*).tag()) {
             (@field(Value.Tag, @typeName(MappingList))) => {
                 return Value.from(mapping.value_ptr.*).as(MappingList).*;
@@ -264,7 +264,7 @@ export fn Bun__readOriginTimer(vm: *JSC.VirtualMachine) u64 {
 
 export fn Bun__readOriginTimerStart(vm: *JSC.VirtualMachine) f64 {
     // timespce to milliseconds
-    return @floatCast(f64, (@intToFloat(f64, vm.origin_timestamp) + JSC.VirtualMachine.origin_relative_epoch) / 1_000_000.0);
+    return @floatCast(f64, (@floatFromInt(f64, vm.origin_timestamp) + JSC.VirtualMachine.origin_relative_epoch) / 1_000_000.0);
 }
 
 // comptime {
@@ -334,6 +334,33 @@ pub export fn Bun__onDidAppendPlugin(jsc_vm: *VirtualMachine, globalObject: *JSG
     jsc_vm.bundler.linker.plugin_runner = &jsc_vm.plugin_runner.?;
 }
 
+/// Exit-code state that must live in `VirtualMachine.exit_handler`; dispatchers recover the VM via `@fieldParentPtr`.
+pub const ExitHandler = struct {
+    exit_code: u8 = 0,
+
+    extern fn Process__dispatchOnBeforeExit(*JSC.JSGlobalObject, code: u8) void;
+    extern fn Process__dispatchOnExit(*JSC.JSGlobalObject, code: u8) void;
+    extern fn Bun__closeAllSQLiteDatabasesForTermination() void;
+
+    pub export fn Bun__getExitCode(vm: *VirtualMachine) u8 {
+        return vm.exit_handler.exit_code;
+    }
+
+    pub export fn Bun__setExitCode(vm: *VirtualMachine, code: u8) void {
+        vm.exit_handler.exit_code = code;
+    }
+
+    pub fn dispatchOnBeforeExit(this: *ExitHandler) void {
+        Process__dispatchOnBeforeExit(@fieldParentPtr(VirtualMachine, "exit_handler", this).global, this.exit_code);
+    }
+
+    /// Calls `Process__dispatchOnExit`, then `Bun__closeAllSQLiteDatabasesForTermination`.
+    pub fn dispatchOnExit(this: *ExitHandler) void {
+        Process__dispatchOnExit(@fieldParentPtr(VirtualMachine, "exit_handler", this).global, this.exit_code);
+        Bun__closeAllSQLiteDatabasesForTermination();
+    }
+};
+
 /// TODO: rename this to ScriptExecutionContext
 /// This is the shared global state for a single JS instance execution
 /// Today, Bun is one VM per thread, so the name "VirtualMachine" sort of makes sense
@@ -376,6 +403,7 @@ pub const VirtualMachine = struct {
     plugin_runner: ?PluginRunner = null,
     is_main_thread: bool = false,
     last_reported_error_for_dedupe: JSValue = .zero,
+    exit_handler: ExitHandler = .{},
 
     /// Do not access this field directly
     /// It exists in the VirtualMachine struct so that
@@ -593,7 +621,10 @@ pub const VirtualMachine = struct {
     pub inline fn nodeFS(this: *VirtualMachine) *Node.NodeFS {
         return this.node_fs orelse brk: {
             this.node_fs = bun.default_allocator.create(Node.NodeFS) catch unreachable;
-            this.node_fs.?.* = Node.NodeFS{};
+            this.node_fs.?.* = Node.NodeFS{
+                // only used when standalone module graph is enabled
+                .vm = if (this.standalone_module_graph != null) this else null,
+            };
             break :brk this.node_fs.?;
         };
     }
@@ -617,7 +648,29 @@ pub const VirtualMachine = struct {
         loop.run();
     }
 
+    /// Runs `dispatchOnBeforeExit`, then drains pending work and dispatches again until a dispatch leaves nothing pending.
+    pub fn onBeforeExit(this: *VirtualMachine) void {
+        this.exit_handler.dispatchOnBeforeExit();
+        while (true) {
+            var drained_work = false;
+            while (this.eventLoop().tasks.count > 0 or this.active_tasks > 0 or this.uws_event_loop.?.active > 0) {
+                this.tick();
+                this.eventLoop().autoTickActive();
+                drained_work = true;
+            }
+
+            // Nothing ran since the previous dispatch, so no further dispatch is made.
+            if (!drained_work) break;
+
+            this.exit_handler.dispatchOnBeforeExit();
+            // Re-check after dispatching: loop again only if work is pending now.
+            if (!(this.eventLoop().tasks.count > 0 or this.active_tasks > 0 or this.uws_event_loop.?.active > 0)) break;
+        }
+    }
+
     pub fn onExit(this: *VirtualMachine) void {
+        this.exit_handler.dispatchOnExit();
+
         var rare_data = this.rare_data orelse return;
         var hook = rare_data.cleanup_hook orelse return;
         hook.execute();
@@ -653,6 +706,10 @@ pub const VirtualMachine = struct {
         this.eventLoop().waitForPromise(promise);
     }
 
+    pub fn waitForPromiseWithTimeout(this: *VirtualMachine, promise: JSC.AnyPromise, timeout: u32) bool {
+        return this.eventLoop().waitForPromiseWithTimeout(promise, timeout);
+    }
+
     pub fn waitForTasks(this: *VirtualMachine) void {
         this.eventLoop().waitForTasks();
     }
@@ -794,6 +851,7 @@ pub const VirtualMachine = struct {
 
         vm.bundler.macro_context = null;
         vm.bundler.resolver.store_fd = false;
+        vm.bundler.resolver.prefer_module_field = false;
 
         vm.bundler.resolver.onWakePackageManager = .{
             .context = &vm.modules,
@@ -891,6 +949,7 @@ pub const VirtualMachine = struct {
 
         vm.bundler.macro_context = null;
         vm.bundler.resolver.store_fd = store_fd;
+        vm.bundler.resolver.prefer_module_field = false;
 
         vm.bundler.resolver.onWakePackageManager = .{
             .context = &vm.modules,
@@ -957,6 +1016,7 @@ pub const VirtualMachine = struct {
     }
 
     pub fn refCountedStringWithWasNew(this: *VirtualMachine, new: *bool, input_: []const u8, hash_: ?u32, comptime dupe: bool) *JSC.RefString {
+        JSC.markBinding(@src());
         const hash = hash_ orelse JSC.RefString.computeHash(input_);
 
         var entry = this.ref_strings.getOrPut(hash) catch unreachable;
@@ -1382,7 +1442,7 @@ pub const VirtualMachine = struct {
     // // This double prints
     // pub fn promiseRejectionTracker(global: *JSGlobalObject, promise: *JSPromise, _: JSPromiseRejectionOperation) callconv(.C) JSValue {
     //     const result = promise.result(global.vm());
-    //     if (@enumToInt(VirtualMachine.get().last_error_jsvalue) != @enumToInt(result)) {
+    //     if (@intFromEnum(VirtualMachine.get().last_error_jsvalue) != @intFromEnum(result)) {
     //         VirtualMachine.get().runErrorHandler(result, null);
     //     }
 
@@ -1795,6 +1855,7 @@ pub const VirtualMachine = struct {
                 if (exception) |exception_| {
                     var holder = ZigException.Holder.init();
                     var zig_exception: *ZigException = holder.zigException();
+                    defer zig_exception.deinit();
                     exception_.getStackTrace(&zig_exception.stack);
                     if (zig_exception.stack.frames_len > 0) {
                         if (allow_ansi_color) {
@@ -1823,7 +1884,7 @@ pub const VirtualMachine = struct {
                     iterator(_vm, globalObject, nextValue, ctx.?, false);
                 }
                 inline fn iterator(_: [*c]VM, _: [*c]JSGlobalObject, nextValue: JSValue, ctx: ?*anyopaque, comptime color: bool) void {
-                    var this_ = @intToPtr(*@This(), @ptrToInt(ctx));
+                    var this_ = @ptrFromInt(*@This(), @intFromPtr(ctx));
                     VirtualMachine.get().printErrorlikeObject(nextValue, null, this_.current_exception_list, Writer, this_.writer, color, allow_side_effects);
                 }
             };
@@ -1922,8 +1983,14 @@ pub const VirtualMachine = struct {
 
             while (i < stack.len) : (i += 1) {
                 const frame = stack[@intCast(usize, i)];
-                const file = frame.source_url.slice();
-                const func = frame.function_name.slice();
+                const file_slice = frame.source_url.toUTF8(bun.default_allocator);
+                defer file_slice.deinit();
+                const func_slice = frame.function_name.toUTF8(bun.default_allocator);
+                defer func_slice.deinit();
+
+                const file = file_slice.slice();
+                const func = func_slice.slice();
+
                 if (file.len == 0 and func.len == 0) continue;
 
                 const has_name = std.fmt.count("{any}", .{frame.nameFormatter(
@@ -1973,19 +2040,21 @@ pub const VirtualMachine = struct {
     }
 
     pub fn remapStackFramePositions(this: *VirtualMachine, frames: [*]JSC.ZigStackFrame, frames_count: usize) void {
-        var i: usize = 0;
-        while (i < frames_count) : (i += 1) {
-            if (frames[i].position.isInvalid()) continue;
+        for (frames[0..frames_count]) |*frame| {
+            if (frame.position.isInvalid() or frame.remapped) continue;
+            var sourceURL = frame.source_url.toUTF8(bun.default_allocator);
+            defer sourceURL.deinit();
+
             if (this.source_mappings.resolveMapping(
-                frames[i].source_url.slice(),
-                @max(frames[i].position.line, 0),
-                @max(frames[i].position.column_start, 0),
+                sourceURL.slice(),
+                @max(frame.position.line, 0),
+                @max(frame.position.column_start, 0),
             )) |mapping| {
-                frames[i].position.line = mapping.original.lines;
-                frames[i].position.column_start = mapping.original.columns;
-                frames[i].remapped = true;
+                frame.position.line = mapping.original.lines;
+                frame.position.column_start = mapping.original.columns;
+                frame.remapped = true;
             } else {
-                frames[i].remapped = true;
+                frame.remapped = true;
             }
         }
     }
@@ -2037,14 +2106,16 @@ pub const VirtualMachine = struct {
         if (frames.len == 0) return;
 
         var top = &frames[0];
+        var top_source_url = top.source_url.toUTF8(bun.default_allocator);
+        defer top_source_url.deinit();
         if (this.source_mappings.resolveMapping(
-            top.source_url.slice(),
+            top_source_url.slice(),
             @max(top.position.line, 0),
             @max(top.position.column_start, 0),
         )) |mapping| {
             var log = logger.Log.init(default_allocator);
             var errorable: ErrorableResolvedSource = undefined;
-            var original_source = fetchWithoutOnLoadPlugins(this, this.global, bun.String.init(top.source_url), bun.String.empty, &log, &errorable, .print_source) catch return;
+            var original_source = fetchWithoutOnLoadPlugins(this, this.global, top.source_url, bun.String.empty, &log, &errorable, .print_source) catch return;
             const code = original_source.source_code.toUTF8(bun.default_allocator);
             defer code.deinit();
 
@@ -2066,18 +2137,18 @@ pub const VirtualMachine = struct {
             )) |lines| {
                 var source_lines = exception.stack.source_lines_ptr[0..JSC.ZigException.Holder.source_lines_count];
                 var source_line_numbers = exception.stack.source_lines_numbers[0..JSC.ZigException.Holder.source_lines_count];
-                std.mem.set(ZigString, source_lines, ZigString.Empty);
-                std.mem.set(i32, source_line_numbers, 0);
+                @memset(source_lines, String.empty);
+                @memset(source_line_numbers, 0);
 
                 var lines_ = lines[0..@min(lines.len, source_lines.len)];
                 for (lines_, 0..) |line, j| {
-                    source_lines[(lines_.len - 1) - j] = ZigString.init(line);
+                    source_lines[(lines_.len - 1) - j] = String.init(line);
                     source_line_numbers[j] = top.position.line - @intCast(i32, j) + 1;
                 }
 
                 exception.stack.source_lines_len = @intCast(u8, lines_.len);
 
-                top.position.column_stop = @intCast(i32, source_lines[lines_.len - 1].len);
+                top.position.column_stop = @intCast(i32, source_lines[lines_.len - 1].length());
                 top.position.line_stop = top.position.column_stop;
 
                 // This expression range is no longer accurate
@@ -2089,8 +2160,10 @@ pub const VirtualMachine = struct {
         if (frames.len > 1) {
             for (frames[1..]) |*frame| {
                 if (frame.position.isInvalid()) continue;
+                const source_url = frame.source_url.toUTF8(bun.default_allocator);
+                defer source_url.deinit();
                 if (this.source_mappings.resolveMapping(
-                    frame.source_url.slice(),
+                    source_url.slice(),
                     @max(frame.position.line, 0),
                     @max(frame.position.column_start, 0),
                 )) |mapping| {
@@ -2105,6 +2178,7 @@ pub const VirtualMachine = struct {
     pub fn printErrorInstance(this: *VirtualMachine, error_instance: JSValue, exception_list: ?*ExceptionList, comptime Writer: type, writer: Writer, comptime allow_ansi_color: bool, comptime allow_side_effects: bool) !void {
         var exception_holder = ZigException.Holder.init();
         var exception = exception_holder.zigException();
+        defer exception_holder.deinit();
         this.remapZigException(exception, error_instance, exception_list);
         this.had_errors = true;
 
@@ -2122,15 +2196,18 @@ pub const VirtualMachine = struct {
         var source_lines = exception.stack.sourceLineIterator();
         var last_pad: u64 = 0;
         while (source_lines.untilLast()) |source| {
+            defer source.text.deinit();
+
             const int_size = std.fmt.count("{d}", .{source.line});
             const pad = max_line_number_pad - int_size;
             last_pad = pad;
             try writer.writeByteNTimes(' ', pad);
+
             try writer.print(
                 comptime Output.prettyFmt("<r><d>{d} | <r>{s}\n", allow_ansi_color),
                 .{
                     source.line,
-                    std.mem.trim(u8, source.text, "\n"),
+                    std.mem.trim(u8, source.text.slice(), "\n"),
                 },
             );
         }
@@ -2146,7 +2223,8 @@ pub const VirtualMachine = struct {
             const top_frame = if (exception.stack.frames_len > 0) exception.stack.frames()[0] else null;
             if (top_frame == null or top_frame.?.position.isInvalid()) {
                 defer did_print_name = true;
-                var text = std.mem.trim(u8, source.text, "\n");
+                defer source.text.deinit();
+                var text = std.mem.trim(u8, source.text.slice(), "\n");
 
                 try writer.print(
                     comptime Output.prettyFmt(
@@ -2164,7 +2242,9 @@ pub const VirtualMachine = struct {
                 const int_size = std.fmt.count("{d}", .{source.line});
                 const pad = max_line_number_pad - int_size;
                 try writer.writeByteNTimes(' ', pad);
-                var remainder = std.mem.trim(u8, source.text, "\n");
+                defer source.text.deinit();
+                const text = source.text.slice();
+                var remainder = std.mem.trim(u8, text, "\n");
 
                 try writer.print(
                     comptime Output.prettyFmt(
@@ -2176,7 +2256,7 @@ pub const VirtualMachine = struct {
 
                 if (!top.position.isInvalid()) {
                     var first_non_whitespace = @intCast(u32, top.position.column_start);
-                    while (first_non_whitespace < source.text.len and source.text[first_non_whitespace] == ' ') {
+                    while (first_non_whitespace < text.len and text[first_non_whitespace] == ' ') {
                         first_non_whitespace += 1;
                     }
                     const indent = @intCast(usize, pad) + " | ".len + first_non_whitespace;
@@ -2207,10 +2287,10 @@ pub const VirtualMachine = struct {
         };
 
         var show = Show{
-            .system_code = exception.system_code.len > 0 and !strings.eql(exception.system_code.slice(), name.slice()),
-            .syscall = exception.syscall.len > 0,
+            .system_code = !exception.system_code.eql(name) and !exception.system_code.isEmpty(),
+            .syscall = !exception.syscall.isEmpty(),
             .errno = exception.errno < 0,
-            .path = exception.path.len > 0,
+            .path = !exception.path.isEmpty(),
             .fd = exception.fd != -1,
         };
 
@@ -2250,7 +2330,7 @@ pub const VirtualMachine = struct {
             } else if (show.errno) {
                 try writer.writeAll(" ");
             }
-            try writer.print(comptime Output.prettyFmt(" path<d>: <r><cyan>\"{s}\"<r>\n", allow_ansi_color), .{exception.path});
+            try writer.print(comptime Output.prettyFmt(" path<d>: <r><cyan>\"{}\"<r>\n", allow_ansi_color), .{exception.path});
         }
 
         if (show.fd) {
@@ -2269,12 +2349,12 @@ pub const VirtualMachine = struct {
             } else if (show.errno) {
                 try writer.writeAll(" ");
             }
-            try writer.print(comptime Output.prettyFmt(" code<d>: <r><cyan>\"{s}\"<r>\n", allow_ansi_color), .{exception.system_code});
+            try writer.print(comptime Output.prettyFmt(" code<d>: <r><cyan>\"{}\"<r>\n", allow_ansi_color), .{exception.system_code});
             add_extra_line = true;
         }
 
         if (show.syscall) {
-            try writer.print(comptime Output.prettyFmt(" syscall<d>: <r><cyan>\"{s}\"<r>\n", allow_ansi_color), .{exception.syscall});
+            try writer.print(comptime Output.prettyFmt(" syscall<d>: <r><cyan>\"{}\"<r>\n", allow_ansi_color), .{exception.syscall});
             add_extra_line = true;
         }
 
@@ -2291,22 +2371,22 @@ pub const VirtualMachine = struct {
         try printStackTrace(@TypeOf(writer), writer, exception.stack, allow_ansi_color);
     }
 
-    fn printErrorNameAndMessage(_: *VirtualMachine, name: ZigString, message: ZigString, comptime Writer: type, writer: Writer, comptime allow_ansi_color: bool) !void {
-        if (name.len > 0 and message.len > 0) {
-            const display_name: ZigString = if (!name.is16Bit() and strings.eqlComptime(name.slice(), "Error")) ZigString.init("error") else name;
+    fn printErrorNameAndMessage(_: *VirtualMachine, name: String, message: String, comptime Writer: type, writer: Writer, comptime allow_ansi_color: bool) !void {
+        if (!name.isEmpty() and !message.isEmpty()) {
+            const display_name: String = if (name.eqlComptime("Error")) String.init("error") else name;
 
             try writer.print(comptime Output.prettyFmt("<r><red>{any}<r><d>:<r> <b>{s}<r>\n", allow_ansi_color), .{
                 display_name,
                 message,
             });
-        } else if (name.len > 0) {
-            if (name.is16Bit() or !strings.hasPrefixComptime(name.slice(), "error")) {
-                try writer.print(comptime Output.prettyFmt("<r><red>error<r><d>:<r> <b>{s}<r>\n", allow_ansi_color), .{name});
+        } else if (!name.isEmpty()) {
+            if (!name.hasPrefixComptime("error")) {
+                try writer.print(comptime Output.prettyFmt("<r><red>error<r><d>:<r> <b>{}<r>\n", allow_ansi_color), .{name});
             } else {
-                try writer.print(comptime Output.prettyFmt("<r><red>{s}<r>\n", allow_ansi_color), .{name});
+                try writer.print(comptime Output.prettyFmt("<r><red>{}<r>\n", allow_ansi_color), .{name});
             }
-        } else if (message.len > 0) {
-            try writer.print(comptime Output.prettyFmt("<r><red>error<r><d>:<r> <b>{s}<r>\n", allow_ansi_color), .{message});
+        } else if (!message.isEmpty()) {
+            try writer.print(comptime Output.prettyFmt("<r><red>error<r><d>:<r> <b>{}<r>\n", allow_ansi_color), .{message});
         } else {
             try writer.print(comptime Output.prettyFmt("<r><red>error<r>\n", allow_ansi_color), .{});
         }
@@ -2374,7 +2454,7 @@ pub const EventListenerMixin = struct {
         const FetchEventRejectionHandler = struct {
             pub fn onRejection(_ctx: *anyopaque, err: anyerror, fetch_event: *FetchEvent, value: JSValue) void {
                 onError(
-                    @intToPtr(*CtxType, @ptrToInt(_ctx)),
+                    @ptrFromInt(*CtxType, @intFromPtr(_ctx)),
                     err,
                     value,
                     fetch_event.request_context.?,
@@ -2604,6 +2684,13 @@ pub fn NewHotReloader(comptime Ctx: type, comptime EventLoopType: type, comptime
             return this.tombstones.get(key);
         }
 
+        pub fn onError(
+            _: *@This(),
+            err: anyerror,
+        ) void {
+            Output.prettyErrorln("<r>Watcher crashed: <red><b>{s}<r>", .{@errorName(err)});
+        }
+
         pub fn onFileUpdate(
             this: *@This(),
             events: []watcher.WatchEvent,
@@ -2763,10 +2850,10 @@ pub fn NewHotReloader(comptime Ctx: type, comptime EventLoopType: type, comptime
                                             break :brk path_string.slice();
                                         } else {
                                             var file_path_without_trailing_slash = std.mem.trimRight(u8, file_path, std.fs.path.sep_str);
-                                            @memcpy(&_on_file_update_path_buf, file_path_without_trailing_slash.ptr, file_path_without_trailing_slash.len);
+                                            @memcpy(_on_file_update_path_buf[0..file_path_without_trailing_slash.len], file_path_without_trailing_slash);
                                             _on_file_update_path_buf[file_path_without_trailing_slash.len] = std.fs.path.sep;
 
-                                            @memcpy(_on_file_update_path_buf[file_path_without_trailing_slash.len + 1 ..].ptr, changed_name.ptr, changed_name.len);
+                                            @memcpy(_on_file_update_path_buf[file_path_without_trailing_slash.len + 1 ..][0..changed_name.len], changed_name);
                                             const path_slice = _on_file_update_path_buf[0 .. file_path_without_trailing_slash.len + changed_name.len + 1];
                                             file_hash = @This().Watcher.getHash(path_slice);
                                             break :brk path_slice;
diff --git a/src/bun.js/javascript_core_c_api.zig b/src/bun.js/javascript_core_c_api.zig
index 37f9df15d..aba143a81 100644
--- a/src/bun.js/javascript_core_c_api.zig
+++ b/src/bun.js/javascript_core_c_api.zig
@@ -9,7 +9,7 @@ const std = @import("std");
 const cpp = @import("./bindings/bindings.zig");
 const generic = opaque {
     pub fn value(this: *const @This()) cpp.JSValue {
-        return @intToEnum(cpp.JSValue, @bitCast(cpp.JSValue.Type, @ptrToInt(this)));
+        return @enumFromInt(cpp.JSValue, @bitCast(cpp.JSValue.Type, @intFromPtr(this)));
     }
 
     pub inline fn bunVM(this: *@This()) *@import("root").bun.JSC.VirtualMachine {
@@ -120,13 +120,13 @@ pub const JSType = enum(c_uint) {
     kJSTypeObject,
     kJSTypeSymbol,
 };
-pub const kJSTypeUndefined = @enumToInt(JSType.kJSTypeUndefined);
-pub const kJSTypeNull = @enumToInt(JSType.kJSTypeNull);
-pub const kJSTypeBoolean = @enumToInt(JSType.kJSTypeBoolean);
-pub const kJSTypeNumber = @enumToInt(JSType.kJSTypeNumber);
-pub const kJSTypeString = @enumToInt(JSType.kJSTypeString);
-pub const kJSTypeObject = @enumToInt(JSType.kJSTypeObject);
-pub const kJSTypeSymbol = @enumToInt(JSType.kJSTypeSymbol);
+pub const kJSTypeUndefined = @intFromEnum(JSType.kJSTypeUndefined);
+pub const kJSTypeNull = @intFromEnum(JSType.kJSTypeNull);
+pub const kJSTypeBoolean = @intFromEnum(JSType.kJSTypeBoolean);
+pub const kJSTypeNumber = @intFromEnum(JSType.kJSTypeNumber);
+pub const kJSTypeString = @intFromEnum(JSType.kJSTypeString);
+pub const kJSTypeObject = @intFromEnum(JSType.kJSTypeObject);
+pub const kJSTypeSymbol = @intFromEnum(JSType.kJSTypeSymbol);
 pub const JSTypedArrayType = enum(c_uint) {
     kJSTypedArrayTypeInt8Array,
     kJSTypedArrayTypeInt16Array,
@@ -141,17 +141,17 @@ pub const JSTypedArrayType = enum(c_uint) {
     kJSTypedArrayTypeNone,
     _,
 };
-pub const kJSTypedArrayTypeInt8Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeInt8Array);
-pub const kJSTypedArrayTypeInt16Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeInt16Array);
-pub const kJSTypedArrayTypeInt32Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeInt32Array);
-pub const kJSTypedArrayTypeUint8Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeUint8Array);
-pub const kJSTypedArrayTypeUint8ClampedArray = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeUint8ClampedArray);
-pub const kJSTypedArrayTypeUint16Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeUint16Array);
-pub const kJSTypedArrayTypeUint32Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeUint32Array);
-pub const kJSTypedArrayTypeFloat32Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeFloat32Array);
-pub const kJSTypedArrayTypeFloat64Array = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeFloat64Array);
-pub const kJSTypedArrayTypeArrayBuffer = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeArrayBuffer);
-pub const kJSTypedArrayTypeNone = @enumToInt(JSTypedArrayType.kJSTypedArrayTypeNone);
+pub const kJSTypedArrayTypeInt8Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeInt8Array);
+pub const kJSTypedArrayTypeInt16Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeInt16Array);
+pub const kJSTypedArrayTypeInt32Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeInt32Array);
+pub const kJSTypedArrayTypeUint8Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeUint8Array);
+pub const kJSTypedArrayTypeUint8ClampedArray = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeUint8ClampedArray);
+pub const kJSTypedArrayTypeUint16Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeUint16Array);
+pub const kJSTypedArrayTypeUint32Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeUint32Array);
+pub const kJSTypedArrayTypeFloat32Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeFloat32Array);
+pub const kJSTypedArrayTypeFloat64Array = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeFloat64Array);
+pub const kJSTypedArrayTypeArrayBuffer = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeArrayBuffer);
+pub const kJSTypedArrayTypeNone = @intFromEnum(JSTypedArrayType.kJSTypedArrayTypeNone);
 pub extern fn JSValueGetType(ctx: JSContextRef, value: JSValueRef) JSType;
 pub extern fn JSValueIsUndefined(ctx: JSContextRef, value: JSValueRef) bool;
 pub extern fn JSValueIsNull(ctx: JSContextRef, value: JSValueRef) bool;
@@ -187,7 +187,7 @@ pub inline fn JSValueUnprotect(ctx: JSContextRef, value: JSValueRef) void {
     };
     if (comptime log_protection) {
         const Output = @import("root").bun.Output;
-        Output.debug("[unprotect] {d}\n", .{@ptrToInt(value)});
+        Output.debug("[unprotect] {d}\n", .{@intFromPtr(value)});
     }
     // wrapper exists to make it easier to set a breakpoint
     Wrapped.JSValueUnprotect(ctx, value);
@@ -199,7 +199,7 @@ pub inline fn JSValueProtect(ctx: JSContextRef, value: JSValueRef) void {
     };
     if (comptime log_protection) {
         const Output = @import("root").bun.Output;
-        Output.debug("[protect] {d}\n", .{@ptrToInt(value)});
+        Output.debug("[protect] {d}\n", .{@intFromPtr(value)});
     }
     // wrapper exists to make it easier to set a breakpoint
     Wrapped.JSValueProtect(ctx, value);
@@ -212,18 +212,18 @@ pub const JSPropertyAttributes = enum(c_uint) {
     kJSPropertyAttributeDontDelete = 8,
     _,
 };
-pub const kJSPropertyAttributeNone = @enumToInt(JSPropertyAttributes.kJSPropertyAttributeNone);
-pub const kJSPropertyAttributeReadOnly = @enumToInt(JSPropertyAttributes.kJSPropertyAttributeReadOnly);
-pub const kJSPropertyAttributeDontEnum = @enumToInt(JSPropertyAttributes.kJSPropertyAttributeDontEnum);
-pub const kJSPropertyAttributeDontDelete = @enumToInt(JSPropertyAttributes.kJSPropertyAttributeDontDelete);
+pub const kJSPropertyAttributeNone = @intFromEnum(JSPropertyAttributes.kJSPropertyAttributeNone);
+pub const kJSPropertyAttributeReadOnly = @intFromEnum(JSPropertyAttributes.kJSPropertyAttributeReadOnly);
+pub const kJSPropertyAttributeDontEnum = @intFromEnum(JSPropertyAttributes.kJSPropertyAttributeDontEnum);
+pub const kJSPropertyAttributeDontDelete = @intFromEnum(JSPropertyAttributes.kJSPropertyAttributeDontDelete);
 pub const JSClassAttributes = enum(c_uint) {
     kJSClassAttributeNone = 0,
     kJSClassAttributeNoAutomaticPrototype = 2,
     _,
 };
 
-pub const kJSClassAttributeNone = @enumToInt(JSClassAttributes.kJSClassAttributeNone);
-pub const kJSClassAttributeNoAutomaticPrototype = @enumToInt(JSClassAttributes.kJSClassAttributeNoAutomaticPrototype);
+pub const kJSClassAttributeNone = @intFromEnum(JSClassAttributes.kJSClassAttributeNone);
+pub const kJSClassAttributeNoAutomaticPrototype = @intFromEnum(JSClassAttributes.kJSClassAttributeNoAutomaticPrototype);
 pub const JSObjectInitializeCallback = *const fn (JSContextRef, JSObjectRef) callconv(.C) void;
 pub const JSObjectFinalizeCallback = *const fn (JSObjectRef) callconv(.C) void;
 pub const JSObjectHasPropertyCallback = *const fn (JSContextRef, JSObjectRef, JSStringRef) callconv(.C) bool;
diff --git a/src/bun.js/module_loader.zig b/src/bun.js/module_loader.zig
index d115573be..e7e4d700e 100644
--- a/src/bun.js/module_loader.zig
+++ b/src/bun.js/module_loader.zig
@@ -271,14 +271,8 @@ pub const ModuleLoader = struct {
 
             pub fn onPoll(this: *Queue) void {
                 debug("onPoll", .{});
-                var pm = this.vm().packageManager();
-
-                this.runTasks();
-                _ = pm.scheduleTasks();
                 this.runTasks();
-
                 this.pollModules();
-                _ = pm.flushDependencyQueue();
             }
 
             pub fn runTasks(this: *Queue) void {
@@ -988,6 +982,14 @@ pub const ModuleLoader = struct {
                     jsc_vm.bundler.options.macro_remap;
 
                 var fallback_source: logger.Source = undefined;
+
+                // Usually, we want to close the input file automatically.
+                //
+                // If we're re-using the file descriptor from the fs watcher
+                // Do not close it because that will break the kqueue-based watcher
+                //
+                var should_close_input_file_fd = fd == null;
+
                 var input_file_fd: StoredFileDescriptorType = 0;
                 var parse_options = Bundler.ParseOptions{
                     .allocator = allocator,
@@ -1008,6 +1010,13 @@ pub const ModuleLoader = struct {
                         jsc_vm.main_hash == hash and
                         strings.eqlLong(jsc_vm.main, path.text, false),
                 };
+                defer {
+                    if (should_close_input_file_fd and input_file_fd != 0) {
+                        _ = bun.JSC.Node.Syscall.close(input_file_fd);
+                        input_file_fd = 0;
+                    }
+                }
+
                 if (is_node_override) {
                     if (NodeFallbackModules.contentsFromPath(specifier)) |code| {
                         const fallback_path = Fs.Path.initWithNamespace(specifier, "node");
@@ -1025,6 +1034,7 @@ pub const ModuleLoader = struct {
                         if (jsc_vm.isWatcherEnabled()) {
                             if (input_file_fd != 0) {
                                 if (jsc_vm.bun_watcher != null and !is_node_override and std.fs.path.isAbsolute(path.text) and !strings.contains(path.text, "node_modules")) {
+                                    should_close_input_file_fd = false;
                                     jsc_vm.bun_watcher.?.addFile(
                                         input_file_fd,
                                         path.text,
@@ -1065,6 +1075,7 @@ pub const ModuleLoader = struct {
                     if (jsc_vm.isWatcherEnabled()) {
                         if (input_file_fd != 0) {
                             if (jsc_vm.bun_watcher != null and !is_node_override and std.fs.path.isAbsolute(path.text) and !strings.contains(path.text, "node_modules")) {
+                                should_close_input_file_fd = false;
                                 jsc_vm.bun_watcher.?.addFile(
                                     input_file_fd,
                                     path.text,
@@ -1225,6 +1236,25 @@ pub const ModuleLoader = struct {
                     return resolved_source;
                 }
 
+                // Pass along package.json type "module" if set.
+                const tag = brk: {
+                    if (parse_result.ast.exports_kind == .cjs and parse_result.source.path.isFile()) {
+                        var actual_package_json: *PackageJSON = package_json orelse brk2: {
+                            // this should already be cached virtually always so it's fine to do this
+                            var dir_info = (jsc_vm.bundler.resolver.readDirInfo(parse_result.source.path.name.dir) catch null) orelse
+                                break :brk .javascript;
+
+                            break :brk2 dir_info.package_json orelse dir_info.enclosing_package_json;
+                        } orelse break :brk .javascript;
+
+                        if (actual_package_json.module_type == .esm) {
+                            break :brk ResolvedSource.Tag.package_json_type_module;
+                        }
+                    }
+
+                    break :brk ResolvedSource.Tag.javascript;
+                };
+
                 return .{
                     .allocator = null,
                     .source_code = bun.String.createLatin1(printer.ctx.getWritten()),
@@ -1245,6 +1275,8 @@ pub const ModuleLoader = struct {
 
                     // having JSC own the memory causes crashes
                     .hash = 0,
+
+                    .tag = tag,
                 };
             },
             // provideFetch() should be called
@@ -1298,7 +1330,7 @@ pub const ModuleLoader = struct {
                             var encoded = JSC.EncodedJSValue{
                                 .asPtr = globalThis,
                             };
-                            const globalValue = @intToEnum(JSC.JSValue, encoded.asInt64);
+                            const globalValue = @enumFromInt(JSC.JSValue, encoded.asInt64);
                             globalValue.put(
                                 globalThis,
                                 JSC.ZigString.static("wasmSourceBytes"),
@@ -1836,8 +1868,7 @@ pub const ModuleLoader = struct {
                 .@"node:wasi" => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .@"node:wasi", "node/wasi.js", specifier),
                 .@"node:zlib" => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .@"node:zlib", "node/zlib.js", specifier),
 
-                .@"detect-libc" => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .depd, if (Environment.isLinux) "thirdparty/detect-libc.linux.js" else "thirdparty/detect-libc.js", specifier),
-                .depd => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .depd, "thirdparty/depd.js", specifier),
+                .@"detect-libc" => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .@"detect-libc", if (Environment.isLinux) "thirdparty/detect-libc.linux.js" else "thirdparty/detect-libc.js", specifier),
                 .undici => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .undici, "thirdparty/undici.js", specifier),
                 .ws => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .ws, "thirdparty/ws.js", specifier),
 
@@ -1851,7 +1882,9 @@ pub const ModuleLoader = struct {
                 .@"node:v8" => return jsResolvedSource(jsc_vm, jsc_vm.load_builtins_from_path, .@"node:v8", "node/v8.js", specifier),
             }
         } else if (specifier.hasPrefixComptime(js_ast.Macro.namespaceWithColon)) {
-            if (jsc_vm.macro_entry_points.get(MacroEntryPoint.generateIDFromSpecifier(specifier.byteSlice()))) |entry| {
+            const spec = specifier.toUTF8(bun.default_allocator);
+            defer spec.deinit();
+            if (jsc_vm.macro_entry_points.get(MacroEntryPoint.generateIDFromSpecifier(spec.slice()))) |entry| {
                 return ResolvedSource{
                     .allocator = null,
                     .source_code = bun.String.create(entry.source.contents),
@@ -1864,10 +1897,9 @@ pub const ModuleLoader = struct {
             return ResolvedSource{
                 .allocator = null,
                 .source_code = bun.String.static(
-                    \\const symbol = Symbol.for("CommonJS");
-                    \\const lazy = globalThis[Symbol.for("Bun.lazy")];
-                    \\var masqueradesAsUndefined = lazy("masqueradesAsUndefined");
-                    \\masqueradesAsUndefined[symbol] = 0;
+                    \\var masqueradesAsUndefined=globalThis[Symbol.for("Bun.lazy")]("masqueradesAsUndefined");
+                    \\masqueradesAsUndefined[Symbol.for("CommonJS")]=0;
+                    \\masqueradesAsUndefined.default=masqueradesAsUndefined;
                     \\export default masqueradesAsUndefined;
                     \\
                 ),
@@ -2021,7 +2053,6 @@ pub const HardcodedModule = enum {
     @"node:vm",
     @"node:wasi",
     @"node:zlib",
-    depd,
     undici,
     ws,
     // These are all not implemented yet, but are stubbed
@@ -2047,7 +2078,6 @@ pub const HardcodedModule = enum {
             .{ "bun:main", HardcodedModule.@"bun:main" },
             .{ "bun:sqlite", HardcodedModule.@"bun:sqlite" },
             .{ "bun:events_native", HardcodedModule.@"bun:events_native" },
-            .{ "depd", HardcodedModule.depd },
             .{ "detect-libc", HardcodedModule.@"detect-libc" },
             .{ "node:assert", HardcodedModule.@"node:assert" },
             .{ "node:assert/strict", HardcodedModule.@"node:assert/strict" },
@@ -2118,7 +2148,6 @@ pub const HardcodedModule = enum {
             .{ "bun:events_native", .{ .path = "bun:events_native" } },
             .{ "child_process", .{ .path = "node:child_process" } },
             .{ "crypto", .{ .path = "node:crypto" } },
-            .{ "depd", .{ .path = "depd" } },
             .{ "detect-libc", .{ .path = "detect-libc" } },
             .{ "detect-libc/lib/detect-libc.js", .{ .path = "detect-libc" } },
             .{ "dns", .{ .path = "node:dns" } },
diff --git a/src/bun.js/modules/BufferModule.h b/src/bun.js/modules/BufferModule.h
index 42eab5173..6e6e39e9c 100644
--- a/src/bun.js/modules/BufferModule.h
+++ b/src/bun.js/modules/BufferModule.h
@@ -2,11 +2,127 @@
 #include "../bindings/ZigGlobalObject.h"
 #include "JavaScriptCore/JSGlobalObject.h"
 #include "JavaScriptCore/ObjectConstructor.h"
+#include "simdutf.h"
 
 namespace Zig {
 using namespace WebCore;
 using namespace JSC;
 
+// TODO: Add DOMJIT fast path
+JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_isUtf8,
+                         (JSC::JSGlobalObject * lexicalGlobalObject,
+                          JSC::CallFrame *callframe)) {
+  auto throwScope = DECLARE_THROW_SCOPE(lexicalGlobalObject->vm());
+
+  auto buffer = callframe->argument(0);
+  auto *bufferView = JSC::jsDynamicCast<JSC::JSArrayBufferView *>(buffer);
+  const char *ptr = nullptr;
+  size_t byteLength = 0;
+  if (bufferView) {
+    if (UNLIKELY(bufferView->isDetached())) {
+      throwTypeError(lexicalGlobalObject, throwScope,
+                     "ArrayBufferView is detached"_s);
+      return JSValue::encode({});
+    }
+
+    byteLength = bufferView->byteLength();
+
+    if (byteLength == 0) {
+      return JSValue::encode(jsBoolean(true));
+    }
+
+    ptr = reinterpret_cast<const char *>(bufferView->vector());
+  } else if (auto *arrayBuffer =
+                 JSC::jsDynamicCast<JSC::JSArrayBuffer *>(buffer)) {
+    auto *impl = arrayBuffer->impl();
+
+    if (!impl) {
+      return JSValue::encode(jsBoolean(true));
+    }
+
+    if (UNLIKELY(impl->isDetached())) {
+      throwTypeError(lexicalGlobalObject, throwScope,
+                     "ArrayBuffer is detached"_s);
+      return JSValue::encode({});
+    }
+
+    byteLength = impl->byteLength();
+
+    if (byteLength == 0) {
+      return JSValue::encode(jsBoolean(true));
+    }
+
+    ptr = reinterpret_cast<const char *>(impl->data());
+  } else {
+    throwVMError(
+        lexicalGlobalObject, throwScope,
+        createTypeError(lexicalGlobalObject,
+                        "First argument must be an ArrayBufferView"_s));
+    return JSValue::encode({});
+  }
+
+  RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(
+                                     simdutf::validate_utf8(ptr, byteLength))));
+}
+
+// TODO: Add DOMJIT fast path
+JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_isAscii,
+                         (JSC::JSGlobalObject * lexicalGlobalObject,
+                          JSC::CallFrame *callframe)) {
+  auto throwScope = DECLARE_THROW_SCOPE(lexicalGlobalObject->vm());
+
+  auto buffer = callframe->argument(0);
+  auto *bufferView = JSC::jsDynamicCast<JSC::JSArrayBufferView *>(buffer);
+  const char *ptr = nullptr;
+  size_t byteLength = 0;
+  if (bufferView) {
+    if (UNLIKELY(bufferView->isDetached())) {
+      throwTypeError(lexicalGlobalObject, throwScope,
+                     "ArrayBufferView is detached"_s);
+      return JSValue::encode({});
+    }
+
+    byteLength = bufferView->byteLength();
+
+    if (byteLength == 0) {
+      return JSValue::encode(jsBoolean(true));
+    }
+
+    ptr = reinterpret_cast<const char *>(bufferView->vector());
+  } else if (auto *arrayBuffer =
+                 JSC::jsDynamicCast<JSC::JSArrayBuffer *>(buffer)) {
+    auto *impl = arrayBuffer->impl();
+    // Null-check impl before dereferencing it (same order as isUtf8).
+    if (!impl) {
+      return JSValue::encode(jsBoolean(true));
+    }
+
+    if (UNLIKELY(impl->isDetached())) {
+      throwTypeError(lexicalGlobalObject, throwScope,
+                     "ArrayBuffer is detached"_s);
+      return JSValue::encode({});
+    }
+
+    byteLength = impl->byteLength();
+
+    if (byteLength == 0) {
+      return JSValue::encode(jsBoolean(true));
+    }
+
+    ptr = reinterpret_cast<const char *>(impl->data());
+  } else {
+    throwVMError(
+        lexicalGlobalObject, throwScope,
+        createTypeError(lexicalGlobalObject,
+                        "First argument must be an ArrayBufferView"_s));
+    return JSValue::encode({});
+  }
+
+  RELEASE_AND_RETURN(
+      throwScope,
+      JSValue::encode(jsBoolean(simdutf::validate_ascii(ptr, byteLength))));
+}
+
 JSC_DEFINE_HOST_FUNCTION(jsFunctionNotImplemented,
                          (JSGlobalObject * globalObject,
                           CallFrame *callFrame)) {
@@ -29,10 +145,13 @@ inline void generateBufferSourceCode(JSC::JSGlobalObject *lexicalGlobalObject,
   JSC::JSObject *defaultObject = JSC::constructEmptyObject(
       globalObject, globalObject->objectPrototype(), 12);
 
-  defaultObject->putDirect(vm,
-                           PropertyName(Identifier::fromUid(
-                               vm.symbolRegistry().symbolForKey("CommonJS"_s))),
-                           jsNumber(0), 0);
+  auto CommonJS =
+      Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s));
+
+  defaultObject->putDirect(vm, PropertyName(CommonJS), jsNumber(0), 0);
+
+  exportNames.append(CommonJS);
+  exportValues.append(jsNumber(0));
 
   auto exportProperty = [&](JSC::Identifier name, JSC::JSValue value) {
     exportNames.append(name);
@@ -103,6 +222,20 @@ inline void generateBufferSourceCode(JSC::JSGlobalObject *lexicalGlobalObject,
   exportProperty(JSC::Identifier::fromString(vm, "resolveObjectURL"_s),
                  resolveObjectURL);
 
+  exportProperty(JSC::Identifier::fromString(vm, "isAscii"_s),
+                 JSC::JSFunction::create(vm, globalObject, 1, "isAscii"_s,
+                                         jsBufferConstructorFunction_isAscii,
+                                         ImplementationVisibility::Public,
+                                         NoIntrinsic,
+                                         jsBufferConstructorFunction_isAscii));
+
+  exportProperty(JSC::Identifier::fromString(vm, "isUtf8"_s),
+                 JSC::JSFunction::create(vm, globalObject, 1, "isUtf8"_s,
+                                         jsBufferConstructorFunction_isUtf8,
+                                         ImplementationVisibility::Public,
+                                         NoIntrinsic,
+                                         jsBufferConstructorFunction_isUtf8));
+
   exportNames.append(vm.propertyNames->defaultKeyword);
   exportValues.append(defaultObject);
 }
diff --git a/src/bun.js/modules/NodeModuleModule.cpp b/src/bun.js/modules/NodeModuleModule.cpp
index f11277709..3019ea95c 100644
--- a/src/bun.js/modules/NodeModuleModule.cpp
+++ b/src/bun.js/modules/NodeModuleModule.cpp
@@ -19,34 +19,15 @@ JSC_DEFINE_HOST_FUNCTION(jsFunctionNodeModuleCreateRequire,
     return JSC::JSValue::encode(JSC::jsUndefined());
   }
 
-  auto str = callFrame->uncheckedArgument(0).toStringOrNull(globalObject);
+  auto val = callFrame->uncheckedArgument(0).toWTFString(globalObject);
   RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::jsUndefined()));
-  WTF::String val = str->value(globalObject);
-  auto *meta = Zig::ImportMetaObject::create(globalObject, str);
   auto clientData = WebCore::clientData(vm);
-  auto requireFunction =
-      Zig::ImportMetaObject::createRequireFunction(vm, globalObject, val);
-  auto nameStr = jsCast<JSFunction *>(requireFunction)->name(vm);
-  JSC::JSBoundFunction *boundRequireFunction =
-      JSC::JSBoundFunction::create(vm, globalObject, requireFunction, meta,
-                                   ArgList(), 0, jsString(vm, nameStr));
-  boundRequireFunction->putDirect(
-      vm, clientData->builtinNames().resolvePublicName(),
-      requireFunction->getDirect(
-          vm, clientData->builtinNames().resolvePublicName()),
-      0);
-
-  RELEASE_AND_RETURN(scope, JSValue::encode(boundRequireFunction));
-}
-JSC_DEFINE_HOST_FUNCTION(jsFunctionNodeModulePaths,
-                         (JSC::JSGlobalObject * globalObject,
-                          JSC::CallFrame *callFrame)) {
-  return JSC::JSValue::encode(JSC::JSArray::create(
-      globalObject->vm(),
-      globalObject->arrayStructureForIndexingTypeDuringAllocation(
-          ArrayWithContiguous),
-      0));
+  RELEASE_AND_RETURN(
+      scope, JSValue::encode(Zig::ImportMetaObject::createRequireFunction(
+                 vm, globalObject, val)));
 }
+extern "C" EncodedJSValue Resolver__nodeModulePathsForJS(JSGlobalObject *,
+                                                         CallFrame *);
 
 JSC_DEFINE_HOST_FUNCTION(jsFunctionFindSourceMap,
                          (JSGlobalObject * globalObject,
@@ -122,48 +103,56 @@ void generateNodeModuleModule(JSC::JSGlobalObject *globalObject,
                               JSC::MarkedArgumentBuffer &exportValues) {
   JSC::VM &vm = globalObject->vm();
 
-  exportValues.append(JSFunction::create(
-      vm, globalObject, 1, String("createRequire"_s),
-      jsFunctionNodeModuleCreateRequire, ImplementationVisibility::Public));
-  exportValues.append(JSFunction::create(vm, globalObject, 1, String("paths"_s),
-                                         jsFunctionNodeModulePaths,
-                                         ImplementationVisibility::Public));
-  exportValues.append(JSFunction::create(
-      vm, globalObject, 1, String("findSourceMap"_s), jsFunctionFindSourceMap,
-      ImplementationVisibility::Public));
-  exportValues.append(JSFunction::create(
-      vm, globalObject, 0, String("syncBuiltinExports"_s),
-      jsFunctionSyncBuiltinExports, ImplementationVisibility::Public));
-  exportValues.append(
-      JSFunction::create(vm, globalObject, 1, String("SourceMap"_s),
-                         jsFunctionSourceMap, ImplementationVisibility::Public,
-                         NoIntrinsic, jsFunctionSourceMap, nullptr));
-
-  exportNames.append(JSC::Identifier::fromString(vm, "createRequire"_s));
-  exportNames.append(JSC::Identifier::fromString(vm, "paths"_s));
-  exportNames.append(JSC::Identifier::fromString(vm, "findSourceMap"_s));
-  exportNames.append(JSC::Identifier::fromString(vm, "syncBuiltinExports"_s));
-  exportNames.append(JSC::Identifier::fromString(vm, "SourceMap"_s));
-
-  // note: this is not technically correct
-  // it doesn't set process.mainModule
-  exportNames.append(JSC::Identifier::fromString(vm, "_resolveFileName"_s));
-  exportValues.append(JSFunction::create(
-      vm, globalObject, 3, String("_resolveFileName"_s),
-      jsFunctionResolveFileName, ImplementationVisibility::Public));
-
-  exportNames.append(JSC::Identifier::fromString(vm, "_nodeModulePaths"_s));
-  exportValues.append(JSFunction::create(
-      vm, globalObject, 0, String("_nodeModulePaths"_s),
-      jsFunctionNodeModulePaths, ImplementationVisibility::Public));
-
-  exportNames.append(JSC::Identifier::fromString(vm, "_cache"_s));
-  exportValues.append(JSC::constructEmptyObject(globalObject));
-
-  exportNames.append(JSC::Identifier::fromString(vm, "builtinModules"_s));
-
-  exportNames.append(JSC::Identifier::fromString(vm, "globalPaths"_s));
-  exportValues.append(JSC::constructEmptyArray(globalObject, 0));
+  JSObject *defaultObject = JSC::constructEmptyObject(
+      vm, globalObject->nullPrototypeObjectStructure());
+  auto append = [&](Identifier name, JSValue value) {
+    defaultObject->putDirect(vm, name, value);
+    exportNames.append(name);
+    exportValues.append(value);
+  };
+
+  append(Identifier::fromString(vm, "createRequire"_s),
+         JSFunction::create(vm, globalObject, 1, String("createRequire"_s),
+                            jsFunctionNodeModuleCreateRequire,
+                            ImplementationVisibility::Public));
+
+  append(Identifier::fromString(vm, "paths"_s),
+         JSFunction::create(vm, globalObject, 1, String("paths"_s),
+                            Resolver__nodeModulePathsForJS,
+                            ImplementationVisibility::Public));
+
+  append(Identifier::fromString(vm, "findSourceMap"_s),
+         JSFunction::create(vm, globalObject, 1, String("findSourceMap"_s),
+                            jsFunctionFindSourceMap,
+                            ImplementationVisibility::Public));
+  append(Identifier::fromString(vm, "syncBuiltinExports"_s),
+         JSFunction::create(vm, globalObject, 0, String("syncBuiltinExports"_s),
+                            jsFunctionSyncBuiltinExports,
+                            ImplementationVisibility::Public));
+  append(Identifier::fromString(vm, "SourceMap"_s),
+         JSFunction::create(vm, globalObject, 1, String("SourceMap"_s),
+                            jsFunctionSourceMap,
+                            ImplementationVisibility::Public, NoIntrinsic,
+                            jsFunctionSourceMap, nullptr));
+
+  append(JSC::Identifier::fromString(vm, "_resolveFilename"_s),
+         JSFunction::create(vm, globalObject, 3, String("_resolveFilename"_s),
+                            jsFunctionResolveFileName,
+                            ImplementationVisibility::Public));
+
+  append(JSC::Identifier::fromString(vm, "_nodeModulePaths"_s),
+         JSFunction::create(vm, globalObject, 0, String("_nodeModulePaths"_s),
+                            Resolver__nodeModulePathsForJS,
+                            ImplementationVisibility::Public));
+
+  append(JSC::Identifier::fromString(vm, "_cache"_s),
+         jsCast<Zig::GlobalObject *>(globalObject)->lazyRequireCacheObject());
+
+  append(JSC::Identifier::fromString(vm, "globalPaths"_s),
+         JSC::constructEmptyArray(globalObject, nullptr, 0));
+
+  append(JSC::Identifier::fromString(vm, "prototype"_s),
+         JSC::constructEmptyObject(globalObject));
 
   JSC::JSArray *builtinModules = JSC::JSArray::create(
       vm,
@@ -184,6 +173,15 @@ void generateNodeModuleModule(JSC::JSGlobalObject *globalObject,
                                  JSC::jsString(vm, String("bun:ffi"_s)));
   builtinModules->putDirectIndex(globalObject, 6,
                                  JSC::jsString(vm, String("bun:sqlite"_s)));
-  exportValues.append(builtinModules);
+
+  append(JSC::Identifier::fromString(vm, "builtinModules"_s), builtinModules);
+
+  defaultObject->putDirect(vm,
+                           JSC::PropertyName(Identifier::fromUid(
+                               vm.symbolRegistry().symbolForKey("CommonJS"_s))),
+                           jsNumber(0), 0);
+
+  exportNames.append(vm.propertyNames->defaultKeyword);
+  exportValues.append(defaultObject);
 }
 } // namespace Zig
diff --git a/src/bun.js/modules/ProcessModule.h b/src/bun.js/modules/ProcessModule.h
index 3c9c3261f..fab0298ae 100644
--- a/src/bun.js/modules/ProcessModule.h
+++ b/src/bun.js/modules/ProcessModule.h
@@ -44,32 +44,37 @@ inline void generateProcessSourceCode(JSC::JSGlobalObject *lexicalGlobalObject,
       reinterpret_cast<GlobalObject *>(lexicalGlobalObject);
 
   JSC::JSObject *process = globalObject->processObject();
+  auto scope = DECLARE_THROW_SCOPE(vm);
+  if (!process->staticPropertiesReified()) {
+    process->reifyAllStaticProperties(globalObject);
+    if (scope.exception())
+      return;
+  }
 
   PropertyNameArray properties(vm, PropertyNameMode::Strings,
                                PrivateSymbolMode::Exclude);
   process->getPropertyNames(globalObject, properties,
                             DontEnumPropertiesMode::Exclude);
+  if (scope.exception())
+    return;
+
+  exportNames.append(vm.propertyNames->defaultKeyword);
+  exportValues.append(process);
 
-  exportNames.append(JSC::Identifier::fromString(vm, "default"_s));
-  JSFunction *processModuleCommonJS = JSFunction::create(
-      vm, globalObject, 0, "process"_s, jsFunctionProcessModuleCommonJS,
-      ImplementationVisibility::Public);
-  processModuleCommonJS->putDirect(
-      vm,
-      PropertyName(
-          Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s))),
-      jsBoolean(true), 0);
-  exportValues.append(processModuleCommonJS);
+  exportNames.append(
+      Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s)));
+  exportValues.append(jsNumber(0));
 
   for (auto &entry : properties) {
     exportNames.append(entry);
-    exportValues.append(process->get(globalObject, entry));
-    processModuleCommonJS->putDirectCustomAccessor(
-        vm, entry,
-        JSC::CustomGetterSetter::create(vm,
-                                        jsFunctionProcessModuleCommonJSGetter,
-                                        jsFunctionProcessModuleCommonJSSetter),
-        0);
+    auto catchScope = DECLARE_CATCH_SCOPE(vm);
+    JSValue result = process->get(globalObject, entry);
+    if (catchScope.exception()) {
+      result = jsUndefined();
+      catchScope.clearException();
+    }
+
+    exportValues.append(result);
   }
 }
 
diff --git a/src/bun.js/modules/StringDecoderModule.h b/src/bun.js/modules/StringDecoderModule.h
index c3b5f57bb..1dbf5ef8e 100644
--- a/src/bun.js/modules/StringDecoderModule.h
+++ b/src/bun.js/modules/StringDecoderModule.h
@@ -16,11 +16,11 @@ generateStringDecoderSourceCode(JSC::JSGlobalObject *lexicalGlobalObject,
   exportNames.append(JSC::Identifier::fromString(vm, "StringDecoder"_s));
   exportValues.append(globalObject->JSStringDecoder());
 
+  auto CommonJS =
+      Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s));
+
   JSC::JSObject *defaultObject = constructEmptyObject(globalObject);
-  defaultObject->putDirect(vm,
-                           PropertyName(Identifier::fromUid(
-                               vm.symbolRegistry().symbolForKey("CommonJS"_s))),
-                           jsNumber(0), 0);
+  defaultObject->putDirect(vm, PropertyName(CommonJS), jsNumber(0), 0);
 
   for (size_t i = 0; i < exportNames.size(); i++) {
     defaultObject->putDirect(vm, exportNames[i], exportValues.at(i), 0);
@@ -28,6 +28,9 @@ generateStringDecoderSourceCode(JSC::JSGlobalObject *lexicalGlobalObject,
 
   exportNames.append(vm.propertyNames->defaultKeyword);
   exportValues.append(defaultObject);
+
+  exportNames.append(CommonJS);
+  exportValues.append(jsNumber(0));
 }
 
 } // namespace Zig
diff --git a/src/bun.js/modules/TTYModule.h b/src/bun.js/modules/TTYModule.h
index 423268b32..79bc8c871 100644
--- a/src/bun.js/modules/TTYModule.h
+++ b/src/bun.js/modules/TTYModule.h
@@ -62,17 +62,20 @@ inline void generateTTYSourceCode(JSC::JSGlobalObject *lexicalGlobalObject,
   tty->putDirect(vm, JSC::Identifier::fromString(vm, "WriteStream"_s), notimpl);
   exportValues.append(notimpl);
 
-  tty->putDirect(vm,
-                 PropertyName(Identifier::fromUid(
-                     vm.symbolRegistry().symbolForKey("CommonJS"_s))),
-                 jsNumber(0), 0);
-
   for (size_t i = 0; i < exportNames.size(); i++) {
     tty->putDirect(vm, exportNames[i], exportValues.at(i), 0);
   }
 
   exportNames.append(vm.propertyNames->defaultKeyword);
   exportValues.append(tty);
+
+  auto CommonJS =
+      Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s));
+
+  exportNames.append(CommonJS);
+  exportValues.append(jsNumber(0));
+
+  tty->putDirect(vm, PropertyName(CommonJS), jsNumber(0), 0);
 }
 
 } // namespace Zig
diff --git a/src/bun.js/node/buffer.zig b/src/bun.js/node/buffer.zig
index f73498069..3a0750f05 100644
--- a/src/bun.js/node/buffer.zig
+++ b/src/bun.js/node/buffer.zig
@@ -50,7 +50,7 @@ pub const BufferVectorized = struct {
 
         switch (written) {
             0 => {},
-            1 => @memset(buf.ptr, buf[0], buf.len),
+            1 => @memset(buf, buf[0]),
             else => {
                 var contents = buf[0..written];
                 buf = buf[written..];
diff --git a/src/bun.js/node/dir_iterator.zig b/src/bun.js/node/dir_iterator.zig
index aa939679c..dac78e5e2 100644
--- a/src/bun.js/node/dir_iterator.zig
+++ b/src/bun.js/node/dir_iterator.zig
@@ -78,15 +78,15 @@ pub const Iterator = switch (builtin.os.tag) {
                 }
 
                 const entry_kind = switch (darwin_entry.d_type) {
-                    os.DT.BLK => Entry.Kind.BlockDevice,
-                    os.DT.CHR => Entry.Kind.CharacterDevice,
-                    os.DT.DIR => Entry.Kind.Directory,
-                    os.DT.FIFO => Entry.Kind.NamedPipe,
-                    os.DT.LNK => Entry.Kind.SymLink,
-                    os.DT.REG => Entry.Kind.File,
-                    os.DT.SOCK => Entry.Kind.UnixDomainSocket,
-                    os.DT.WHT => Entry.Kind.Whiteout,
-                    else => Entry.Kind.Unknown,
+                    os.DT.BLK => Entry.Kind.block_device,
+                    os.DT.CHR => Entry.Kind.character_device,
+                    os.DT.DIR => Entry.Kind.directory,
+                    os.DT.FIFO => Entry.Kind.named_pipe,
+                    os.DT.LNK => Entry.Kind.sym_link,
+                    os.DT.REG => Entry.Kind.file,
+                    os.DT.SOCK => Entry.Kind.unix_domain_socket,
+                    os.DT.WHT => Entry.Kind.whiteout,
+                    else => Entry.Kind.unknown,
                 };
                 return .{
                     .result = IteratorResult{
@@ -134,14 +134,14 @@ pub const Iterator = switch (builtin.os.tag) {
                 }
 
                 const entry_kind = switch (linux_entry.d_type) {
-                    linux.DT.BLK => Entry.Kind.BlockDevice,
-                    linux.DT.CHR => Entry.Kind.CharacterDevice,
-                    linux.DT.DIR => Entry.Kind.Directory,
-                    linux.DT.FIFO => Entry.Kind.NamedPipe,
-                    linux.DT.LNK => Entry.Kind.SymLink,
-                    linux.DT.REG => Entry.Kind.File,
-                    linux.DT.SOCK => Entry.Kind.UnixDomainSocket,
-                    else => Entry.Kind.Unknown,
+                    linux.DT.BLK => Entry.Kind.block_device,
+                    linux.DT.CHR => Entry.Kind.character_device,
+                    linux.DT.DIR => Entry.Kind.directory,
+                    linux.DT.FIFO => Entry.Kind.named_pipe,
+                    linux.DT.LNK => Entry.Kind.sym_link,
+                    linux.DT.REG => Entry.Kind.file,
+                    linux.DT.SOCK => Entry.Kind.unix_domain_socket,
+                    else => Entry.Kind.unknown,
                 };
                 return .{
                     .result = IteratorResult{
@@ -213,9 +213,9 @@ pub const Iterator = switch (builtin.os.tag) {
                 const name_utf8 = self.name_data[0..name_utf8_len];
                 const kind = blk: {
                     const attrs = dir_info.FileAttributes;
-                    if (attrs & w.FILE_ATTRIBUTE_DIRECTORY != 0) break :blk Entry.Kind.Directory;
-                    if (attrs & w.FILE_ATTRIBUTE_REPARSE_POINT != 0) break :blk Entry.Kind.SymLink;
-                    break :blk Entry.Kind.File;
+                    if (attrs & w.FILE_ATTRIBUTE_DIRECTORY != 0) break :blk Entry.Kind.directory;
+                    if (attrs & w.FILE_ATTRIBUTE_REPARSE_POINT != 0) break :blk Entry.Kind.sym_link;
+                    break :blk Entry.Kind.file;
                 };
                 return .{
                     .result = IteratorResult{
@@ -275,13 +275,13 @@ pub const Iterator = switch (builtin.os.tag) {
                 }
 
                 const entry_kind = switch (entry.d_type) {
-                    .BLOCK_DEVICE => Entry.Kind.BlockDevice,
-                    .CHARACTER_DEVICE => Entry.Kind.CharacterDevice,
-                    .DIRECTORY => Entry.Kind.Directory,
-                    .SYMBOLIC_LINK => Entry.Kind.SymLink,
-                    .REGULAR_FILE => Entry.Kind.File,
-                    .SOCKET_STREAM, .SOCKET_DGRAM => Entry.Kind.UnixDomainSocket,
-                    else => Entry.Kind.Unknown,
+                    .BLOCK_DEVICE => Entry.Kind.block_device,
+                    .CHARACTER_DEVICE => Entry.Kind.character_device,
+                    .DIRECTORY => Entry.Kind.directory,
+                    .SYMBOLIC_LINK => Entry.Kind.sym_link,
+                    .REGULAR_FILE => Entry.Kind.file,
+                    .SOCKET_STREAM, .SOCKET_DGRAM => Entry.Kind.unix_domain_socket,
+                    else => Entry.Kind.unknown,
                 };
                 return IteratorResult{
                     .name = name,
diff --git a/src/bun.js/node/fs_events.zig b/src/bun.js/node/fs_events.zig
new file mode 100644
index 000000000..a3fba5441
--- /dev/null
+++ b/src/bun.js/node/fs_events.zig
@@ -0,0 +1,609 @@
+const std = @import("std");
+const bun = @import("root").bun;
+const Environment = bun.Environment;
+const Mutex = @import("../../lock.zig").Lock;
+const sync = @import("../../sync.zig");
+const Semaphore = sync.Semaphore;
+const UnboundedQueue = @import("../unbounded_queue.zig").UnboundedQueue;
+const TaggedPointerUnion = @import("../../tagged_pointer.zig").TaggedPointerUnion;
+const string = bun.string;
+// Zig aliases for the CoreFoundation / CoreServices (FSEvents) C types that the bindings in this file pass across the C ABI.
+pub const CFAbsoluteTime = f64;
+pub const CFTimeInterval = f64;
+pub const CFArrayCallBacks = anyopaque;
+
+pub const FSEventStreamEventFlags = c_int;
+pub const OSStatus = c_int;
+pub const CFIndex = c_long;
+
+pub const FSEventStreamCreateFlags = u32;
+pub const FSEventStreamEventId = u64;
+
+pub const CFStringEncoding = c_uint;
+// Framework object references (the `*Ref` names) are declared here as nullable untyped pointers.
+pub const CFArrayRef = ?*anyopaque;
+pub const CFAllocatorRef = ?*anyopaque;
+pub const CFBundleRef = ?*anyopaque;
+pub const CFDictionaryRef = ?*anyopaque;
+pub const CFRunLoopRef = ?*anyopaque;
+pub const CFRunLoopSourceRef = ?*anyopaque;
+pub const CFStringRef = ?*anyopaque;
+pub const CFTypeRef = ?*anyopaque;
+pub const FSEventStreamRef = ?*anyopaque;
+pub const FSEventStreamCallback = *const fn (FSEventStreamRef, ?*anyopaque, usize, ?*anyopaque, *FSEventStreamEventFlags, *FSEventStreamEventId) callconv(.C) void;
+
+/// Source context handed to `CoreFoundation.RunLoopSourceCreate`. Only `info` and `perform` are used here; every other slot defaults to null.
+pub const CFRunLoopSourceContext = extern struct {
+    version: CFIndex = 0,
+    info: *anyopaque,
+    retain: ?*anyopaque = null,
+    release: ?*anyopaque = null,
+    copyDescription: ?*anyopaque = null,
+    equal: ?*anyopaque = null,
+    hash: ?*anyopaque = null,
+    schedule: ?*anyopaque = null,
+    cancel: ?*anyopaque = null,
+    perform: *const fn (?*anyopaque) callconv(.C) void,
+};
+/// Context struct whose pointer is the third argument of `CoreServices.FSEventStreamCreate`; `pad` holds three pointer slots that default to null.
+pub const FSEventStreamContext = extern struct {
+    version: CFIndex = 0,
+    info: ?*anyopaque = null,
+    pad: [3]?*anyopaque = .{ null, null, null },
+};
+// Hard-coded numeric values for the framework constants used by these bindings (string encoding, status code, FSEventStream flags).
+pub const kCFStringEncodingUTF8: CFStringEncoding = 0x8000100;
+pub const noErr: OSStatus = 0;
+// FSEventStream creation flag bits.
+pub const kFSEventStreamCreateFlagNoDefer: c_int = 2;
+pub const kFSEventStreamCreateFlagFileEvents: c_int = 16;
+// FSEventStream per-event flag bits; they are combined into the kFSEvents* masks declared later in this file.
+pub const kFSEventStreamEventFlagEventIdsWrapped: c_int = 8;
+pub const kFSEventStreamEventFlagHistoryDone: c_int = 16;
+pub const kFSEventStreamEventFlagItemChangeOwner: c_int = 0x4000;
+pub const kFSEventStreamEventFlagItemCreated: c_int = 0x100;
+pub const kFSEventStreamEventFlagItemFinderInfoMod: c_int = 0x2000;
+pub const kFSEventStreamEventFlagItemInodeMetaMod: c_int = 0x400;
+pub const kFSEventStreamEventFlagItemIsDir: c_int = 0x20000;
+pub const kFSEventStreamEventFlagItemModified: c_int = 0x1000;
+pub const kFSEventStreamEventFlagItemRemoved: c_int = 0x200;
+pub const kFSEventStreamEventFlagItemRenamed: c_int = 0x800;
+pub const kFSEventStreamEventFlagItemXattrMod: c_int = 0x8000;
+pub const kFSEventStreamEventFlagKernelDropped: c_int = 4;
+pub const kFSEventStreamEventFlagMount: c_int = 64;
+pub const kFSEventStreamEventFlagRootChanged: c_int = 32;
+pub const kFSEventStreamEventFlagUnmount: c_int = 128;
+pub const kFSEventStreamEventFlagUserDropped: c_int = 2;
+
+// Lazy function call binding.
+const RTLD_LAZY = 0x1;
+// Symbols exported from this image (dynamic library or bundle)
+// are generally hidden and only available to dlsym() when
+// directly using the handle returned by this call to dlopen().
+const RTLD_LOCAL = 0x4;
+// Event-flag mask checked by `_events_cb` to classify an event as a change rather than a rename (owner, Finder info, inode metadata, content, xattr).
+pub const kFSEventsModified: c_int =
+    kFSEventStreamEventFlagItemChangeOwner |
+    kFSEventStreamEventFlagItemFinderInfoMod |
+    kFSEventStreamEventFlagItemInodeMetaMod |
+    kFSEventStreamEventFlagItemModified |
+    kFSEventStreamEventFlagItemXattrMod;
+// Event-flag mask that makes `_events_cb` report an event as a rename (created, removed, renamed).
+pub const kFSEventsRenamed: c_int =
+    kFSEventStreamEventFlagItemCreated |
+    kFSEventStreamEventFlagItemRemoved |
+    kFSEventStreamEventFlagItemRenamed;
+// Mask of the stream-level flags: drops, event-id wrap, history done, mount/unmount, root changed.
+pub const kFSEventsSystem: c_int =
+    kFSEventStreamEventFlagUserDropped |
+    kFSEventStreamEventFlagKernelDropped |
+    kFSEventStreamEventFlagEventIdsWrapped |
+    kFSEventStreamEventFlagHistoryDone |
+    kFSEventStreamEventFlagMount |
+    kFSEventStreamEventFlagUnmount |
+    kFSEventStreamEventFlagRootChanged;
+// `fsevents_mutex` is the lock taken by `CoreFoundation.get` / `CoreServices.get` around `InitLibrary`.
+var fsevents_mutex: Mutex = Mutex.init();
+var fsevents_default_loop_mutex: Mutex = Mutex.init();
+var fsevents_default_loop: ?*FSEventsLoop = null;
+
+/// Resolves `symbol` in `handle` with `std.c.dlsym` and casts the address to `Type`; yields null when the lookup returns null.
+fn dlsym(handle: ?*anyopaque, comptime Type: type, comptime symbol: [:0]const u8) ?Type {
+    // A null address means the symbol was not found; report that to the caller as null.
+    const address = std.c.dlsym(handle, symbol) orelse return null;
+    return bun.cast(Type, address);
+}
+/// Function and data pointers resolved from the CoreFoundation framework by `InitLibrary`, plus the `dlopen` handle they came from.
+pub const CoreFoundation = struct {
+    handle: ?*anyopaque,
+    ArrayCreate: *fn (CFAllocatorRef, [*]?*anyopaque, CFIndex, ?*CFArrayCallBacks) callconv(.C) CFArrayRef,
+    Release: *fn (CFTypeRef) callconv(.C) void,
+
+    RunLoopAddSource: *fn (CFRunLoopRef, CFRunLoopSourceRef, CFStringRef) callconv(.C) void,
+    RunLoopGetCurrent: *fn () callconv(.C) CFRunLoopRef,
+    RunLoopRemoveSource: *fn (CFRunLoopRef, CFRunLoopSourceRef, CFStringRef) callconv(.C) void,
+    RunLoopRun: *fn () callconv(.C) void,
+    RunLoopSourceCreate: *fn (CFAllocatorRef, CFIndex, *CFRunLoopSourceContext) callconv(.C) CFRunLoopSourceRef,
+    RunLoopSourceSignal: *fn (CFRunLoopSourceRef) callconv(.C) void,
+    RunLoopStop: *fn (CFRunLoopRef) callconv(.C) void,
+    RunLoopWakeUp: *fn (CFRunLoopRef) callconv(.C) void,
+    StringCreateWithFileSystemRepresentation: *fn (CFAllocatorRef, [*]const u8) callconv(.C) CFStringRef,
+    RunLoopDefaultMode: *CFStringRef,
+    /// Returns the shared table, calling `InitLibrary` under `fsevents_mutex` if `fsevents_cf` is still null.
+    pub fn get() CoreFoundation {
+        if (fsevents_cf) |cf| return cf;
+        fsevents_mutex.lock();
+        defer fsevents_mutex.unlock();
+        if (fsevents_cf) |cf| return cf;
+
+        InitLibrary();
+
+        return fsevents_cf.?;
+    }
+
+    // We Actually never deinit it
+    // pub fn deinit(this: *CoreFoundation) void {
+    //     if(this.handle) | ptr| {
+    //         this.handle = null;
+    //         _  = std.c.dlclose(this.handle);
+    //     }
+    // }
+
+};
+/// FSEventStream function pointers resolved from the CoreServices framework by `InitLibrary`, plus the `dlopen` handle they came from.
+pub const CoreServices = struct {
+    handle: ?*anyopaque,
+    FSEventStreamCreate: *fn (CFAllocatorRef, FSEventStreamCallback, *FSEventStreamContext, CFArrayRef, FSEventStreamEventId, CFTimeInterval, FSEventStreamCreateFlags) callconv(.C) FSEventStreamRef,
+    FSEventStreamInvalidate: *fn (FSEventStreamRef) callconv(.C) void,
+    FSEventStreamRelease: *fn (FSEventStreamRef) callconv(.C) void,
+    FSEventStreamScheduleWithRunLoop: *fn (FSEventStreamRef, CFRunLoopRef, CFStringRef) callconv(.C) void,
+    FSEventStreamStart: *fn (FSEventStreamRef) callconv(.C) c_int,
+    FSEventStreamStop: *fn (FSEventStreamRef) callconv(.C) void,
+    // libuv set it to -1 so the actual value is this
+    kFSEventStreamEventIdSinceNow: FSEventStreamEventId = 18446744073709551615,
+    /// Returns the shared table, calling `InitLibrary` under `fsevents_mutex` if `fsevents_cs` is still null.
+    pub fn get() CoreServices {
+        if (fsevents_cs) |cs| return cs;
+        fsevents_mutex.lock();
+        defer fsevents_mutex.unlock();
+        if (fsevents_cs) |cs| return cs;
+
+        InitLibrary();
+
+        return fsevents_cs.?;
+    }
+
+    // We Actually never deinit it
+    // pub fn deinit(this: *CoreServices) void {
+    //     if(this.handle) | ptr| {
+    //         this.handle = null;
+    //         _  = std.c.dlclose(this.handle);
+    //     }
+    // }
+
+};
+// Cached symbol tables; both start out null and are assigned by `InitLibrary`.
+var fsevents_cf: ?CoreFoundation = null;
+var fsevents_cs: ?CoreServices = null;
+/// Opens CoreFoundation and CoreServices with `dlopen`, resolves every symbol the two tables need, and stores them in `fsevents_cf` / `fsevents_cs`; panics if a library or symbol is missing.
+fn InitLibrary() void {
+    const fsevents_cf_handle = std.c.dlopen("/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation", RTLD_LAZY | RTLD_LOCAL);
+    if (fsevents_cf_handle == null) @panic("Cannot Load CoreFoundation");
+
+    fsevents_cf = CoreFoundation{
+        .handle = fsevents_cf_handle,
+        .ArrayCreate = dlsym(fsevents_cf_handle, *fn (CFAllocatorRef, [*]?*anyopaque, CFIndex, ?*CFArrayCallBacks) callconv(.C) CFArrayRef, "CFArrayCreate") orelse @panic("Cannot Load CoreFoundation"),
+        .Release = dlsym(fsevents_cf_handle, *fn (CFTypeRef) callconv(.C) void, "CFRelease") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopAddSource = dlsym(fsevents_cf_handle, *fn (CFRunLoopRef, CFRunLoopSourceRef, CFStringRef) callconv(.C) void, "CFRunLoopAddSource") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopGetCurrent = dlsym(fsevents_cf_handle, *fn () callconv(.C) CFRunLoopRef, "CFRunLoopGetCurrent") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopRemoveSource = dlsym(fsevents_cf_handle, *fn (CFRunLoopRef, CFRunLoopSourceRef, CFStringRef) callconv(.C) void, "CFRunLoopRemoveSource") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopRun = dlsym(fsevents_cf_handle, *fn () callconv(.C) void, "CFRunLoopRun") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopSourceCreate = dlsym(fsevents_cf_handle, *fn (CFAllocatorRef, CFIndex, *CFRunLoopSourceContext) callconv(.C) CFRunLoopSourceRef, "CFRunLoopSourceCreate") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopSourceSignal = dlsym(fsevents_cf_handle, *fn (CFRunLoopSourceRef) callconv(.C) void, "CFRunLoopSourceSignal") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopStop = dlsym(fsevents_cf_handle, *fn (CFRunLoopRef) callconv(.C) void, "CFRunLoopStop") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopWakeUp = dlsym(fsevents_cf_handle, *fn (CFRunLoopRef) callconv(.C) void, "CFRunLoopWakeUp") orelse @panic("Cannot Load CoreFoundation"),
+        .StringCreateWithFileSystemRepresentation = dlsym(fsevents_cf_handle, *fn (CFAllocatorRef, [*]const u8) callconv(.C) CFStringRef, "CFStringCreateWithFileSystemRepresentation") orelse @panic("Cannot Load CoreFoundation"),
+        .RunLoopDefaultMode = dlsym(fsevents_cf_handle, *CFStringRef, "kCFRunLoopDefaultMode") orelse @panic("Cannot Load CoreFoundation"),
+    };
+
+    const fsevents_cs_handle = std.c.dlopen("/System/Library/Frameworks/CoreServices.framework/Versions/A/CoreServices", RTLD_LAZY | RTLD_LOCAL);
+    if (fsevents_cs_handle == null) @panic("Cannot Load CoreServices");
+
+    fsevents_cs = CoreServices{
+        .handle = fsevents_cs_handle,
+        .FSEventStreamCreate = dlsym(fsevents_cs_handle, *fn (CFAllocatorRef, FSEventStreamCallback, *FSEventStreamContext, CFArrayRef, FSEventStreamEventId, CFTimeInterval, FSEventStreamCreateFlags) callconv(.C) FSEventStreamRef, "FSEventStreamCreate") orelse @panic("Cannot Load CoreServices"),
+        .FSEventStreamInvalidate = dlsym(fsevents_cs_handle, *fn (FSEventStreamRef) callconv(.C) void, "FSEventStreamInvalidate") orelse @panic("Cannot Load CoreServices"),
+        .FSEventStreamRelease = dlsym(fsevents_cs_handle, *fn (FSEventStreamRef) callconv(.C) void, "FSEventStreamRelease") orelse @panic("Cannot Load CoreServices"),
+        .FSEventStreamScheduleWithRunLoop = dlsym(fsevents_cs_handle, *fn (FSEventStreamRef, CFRunLoopRef, CFStringRef) callconv(.C) void, "FSEventStreamScheduleWithRunLoop") orelse @panic("Cannot Load CoreServices"),
+        .FSEventStreamStart = dlsym(fsevents_cs_handle, *fn (FSEventStreamRef) callconv(.C) c_int, "FSEventStreamStart") orelse @panic("Cannot Load CoreServices"),
+        .FSEventStreamStop = dlsym(fsevents_cs_handle, *fn (FSEventStreamRef) callconv(.C) void, "FSEventStreamStop") orelse @panic("Cannot Load CoreServices"),
+    };
+}
+
+pub const FSEventsLoop = struct {
+    signal_source: CFRunLoopSourceRef,
+    mutex: Mutex,
+    loop: CFRunLoopRef = null,
+    sem: Semaphore,
+    thread: std.Thread = undefined,
+    tasks: ConcurrentTask.Queue = ConcurrentTask.Queue{},
+    watchers: bun.BabyList(?*FSEventsWatcher) = .{},
+    watcher_count: u32 = 0,
+    fsevent_stream: FSEventStreamRef = null,
+    paths: ?[]?*anyopaque = null,
+    cf_paths: CFArrayRef = null,
+    has_scheduled_watchers: bool = false,
+    /// Type-erased unit of work executed on the CF thread.
+    /// `run` calls `callback` with the stored `ctx`; `New` generates the typed adapter.
+    pub const Task = struct {
+        ctx: ?*anyopaque,
+        callback: *const (fn (*anyopaque) void),
+
+        /// Invokes `callback` with `ctx`; `ctx` must be non-null.
+        pub fn run(this: *Task) void {
+            this.callback(this.ctx.?);
+        }
+
+        /// Returns a namespace whose `init` wraps a `*Type` in a `Task` that forwards to `Callback`.
+        pub fn New(comptime Type: type, comptime Callback: anytype) type {
+            return struct {
+                pub fn init(ctx: *Type) Task {
+                    return .{ .ctx = ctx, .callback = wrap };
+                }
+
+                /// Restores the typed pointer from the erased one and calls `Callback` inline.
+                pub fn wrap(this: ?*anyopaque) void {
+                    const typed = @ptrCast(*Type, @alignCast(@alignOf(Type), this.?));
+                    @call(.always_inline, Callback, .{typed});
+                }
+            };
+        }
+    };
+    /// Queue node (linked through `next`) carrying a `Task`; when `auto_delete` is set, `CFLoopCallback` frees the node after running it.
+    pub const ConcurrentTask = struct {
+        task: Task = undefined,
+        next: ?*ConcurrentTask = null,
+        auto_delete: bool = false,
+
+        pub const Queue = UnboundedQueue(ConcurrentTask, .next);
+        /// Overwrites `this.*` with `task` and a null `next`; `auto_delete` falls back to its default (false). Returns `this`.
+        pub fn from(this: *ConcurrentTask, task: Task) *ConcurrentTask {
+            this.* = .{
+                .task = task,
+                .next = null,
+            };
+            return this;
+        }
+    };
+    /// Thread entry point spawned by `init`: records this thread's run loop, attaches `signal_source`, runs the loop, then detaches the source and clears `loop` once `RunLoopRun` returns.
+    pub fn CFThreadLoop(this: *FSEventsLoop) void {
+        bun.Output.Source.configureNamedThread("CFThreadLoop");
+
+        const CF = CoreFoundation.get();
+
+        this.loop = CF.RunLoopGetCurrent();
+
+        CF.RunLoopAddSource(this.loop, this.signal_source, CF.RunLoopDefaultMode.*);
+        // Lets `init` (blocked in `sem.wait()`) return now that `loop` is set and the source is attached.
+        this.sem.post();
+
+        CF.RunLoopRun();
+        CF.RunLoopRemoveSource(this.loop, this.signal_source, CF.RunLoopDefaultMode.*);
+
+        this.loop = null;
+    }
+
+    // Runs in CF thread, executed after `enqueueTaskConcurrent()`.
+    // `arg` is the `info` pointer that `init` stored in the run-loop source context, i.e. the FSEventsLoop.
+    fn CFLoopCallback(arg: ?*anyopaque) callconv(.C) void {
+        const self = arg orelse return;
+        const this = bun.cast(*FSEventsLoop, self);
+
+        // Nothing to do when the popped batch is empty.
+        var batch = this.tasks.popBatch();
+        if (batch.count == 0) return;
+
+        var iter = batch.iterator();
+        while (iter.next()) |entry| {
+            entry.task.run();
+            // Nodes flagged `auto_delete` are freed with `bun.default_allocator` once they have run.
+            if (entry.auto_delete) bun.default_allocator.destroy(entry);
+        }
+    }
+    /// Creates the loop, spawns the CF thread and waits on `sem` until that thread has set up its run loop.
+    /// If source creation or thread spawn fails, the allocation (and the source) are released before the error is returned.
+    pub fn init() !*FSEventsLoop {
+        const this = bun.default_allocator.create(FSEventsLoop) catch unreachable;
+        errdefer bun.default_allocator.destroy(this);
+
+        const CF = CoreFoundation.get();
+        var ctx = CFRunLoopSourceContext{
+            .info = this,
+            .perform = CFLoopCallback,
+        };
+
+        const signal_source = CF.RunLoopSourceCreate(null, 0, &ctx);
+        if (signal_source == null) {
+            return error.FailedToCreateCoreFoudationSourceLoop;
+        }
+        errdefer CF.Release(signal_source); // only runs if the thread spawn below fails
+
+        this.* = FSEventsLoop{ .sem = Semaphore.init(0), .mutex = Mutex.init(), .signal_source = signal_source };
+        this.thread = try std.Thread.spawn(.{}, FSEventsLoop.CFThreadLoop, .{this});
+
+        // sync threads
+        this.sem.wait();
+        return this;
+    }
+    /// Queues `task` for the CF thread: pushes a heap-allocated node that `CFLoopCallback` frees after running it, then signals and wakes the run loop.
+    fn enqueueTaskConcurrent(this: *FSEventsLoop, task: Task) void {
+        const CF = CoreFoundation.get();
+        const concurrent = bun.default_allocator.create(ConcurrentTask) catch unreachable;
+        concurrent.* = .{ .task = task, .next = null, .auto_delete = true }; // not `from()`: it resets `auto_delete` to false and the node would leak
+        this.tasks.push(concurrent);
+        CF.RunLoopSourceSignal(this.signal_source);
+        CF.RunLoopWakeUp(this.loop);
+    }
+
+    // FSEventStream callback, run on the CF thread: matches every reported path against each registered watcher and invokes the watcher's callback.
+    fn _events_cb(_: FSEventStreamRef, info: ?*anyopaque, numEvents: usize, eventPaths: ?*anyopaque, eventFlags: *FSEventStreamEventFlags, _: *FSEventStreamEventId) callconv(.C) void {
+        const paths_ptr = bun.cast([*][*:0]const u8, eventPaths);
+        const paths = paths_ptr[0..numEvents];
+        var loop = bun.cast(*FSEventsLoop, info);
+        const event_flags = bun.cast([*]FSEventStreamEventFlags, eventFlags);
+        // NOTE(review): `loop.watchers` is read here without holding `loop.mutex`, which registerWatcher/unregisterWatcher take before mutating it — confirm this cannot race.
+        for (loop.watchers.slice()) |watcher| {
+            if (watcher) |handle| {
+                for (paths, 0..) |path_ptr, i| {
+                    var flags = event_flags[i];
+                    var path = path_ptr[0..bun.len(path_ptr)];
+                    // Filter out paths that are outside handle's request
+                    if (path.len < handle.path.len or !bun.strings.startsWith(path, handle.path)) {
+                        continue;
+                    }
+                    const is_file = (flags & kFSEventStreamEventFlagItemIsDir) == 0;
+
+                    // Remove common prefix, unless the watched folder is "/"
+                    if (!(handle.path.len == 1 and handle.path[0] == '/')) {
+                        path = path[handle.path.len..];
+                        // Ignore events with path equal to directory itself
+                        // NOTE(review): the condition below tests `is_file` (ItemIsDir flag NOT set), which reads as the opposite of the comment above — confirm intended.
+                        if (path.len <= 1 and is_file) {
+                            continue;
+                        }
+                        if (path.len == 0) {
+                            // Since we're using fsevents to watch the file itself, path == handle.path, and we now need to get the basename of the file back
+                            while (path.len > 0) {
+                                if (bun.strings.startsWithChar(path, '/')) {
+                                    path = path[1..];
+                                    break;
+                                } else {
+                                    path = path[1..];
+                                }
+                            }
+                            // NOTE(review): `path.len == 0` on entry, so the loop above never iterates and `path` stays empty; the event is then dropped by the `path.len == 0` check further down — confirm intended.
+                            // Created and Removed seem to be always set, but don't make sense
+                            flags &= ~kFSEventsRenamed;
+                        } else {
+                            // Skip forward slash
+                            path = path[1..];
+                        }
+                    }
+
+                    // Do not emit events from subdirectories (without option set)
+                    if (path.len == 0 or (bun.strings.containsChar(path, '/') and !handle.recursive)) {
+                        continue;
+                    }
+
+                    var is_rename = true;
+                    // Reported as a plain change (not a rename) only when no kFSEventsRenamed bit is set and the event is a modification or concerns a file.
+                    if ((flags & kFSEventsRenamed) == 0) {
+                        if ((flags & kFSEventsModified) != 0 or is_file) {
+                            is_rename = false;
+                        }
+                    }
+
+                    handle.callback(handle.ctx, path, is_file, is_rename);
+                }
+            }
+        }
+    }
+
+    // Runs on the CF thread: tears down the current FSEventStream (if any) and starts a new one covering every registered watcher path.
+    pub fn _schedule(this: *FSEventsLoop) void {
+        this.mutex.lock();
+        defer this.mutex.unlock();
+        this.has_scheduled_watchers = false;
+
+        const watchers = this.watchers.slice();
+
+        const CF = CoreFoundation.get();
+        const CS = CoreServices.get();
+
+        if (this.fsevent_stream) |stream| {
+            // Stop emitting events
+            CS.FSEventStreamStop(stream);
+
+            // Release stream
+            CS.FSEventStreamInvalidate(stream);
+            CS.FSEventStreamRelease(stream);
+            this.fsevent_stream = null;
+        }
+        // clean old paths (the slice comes from `alloc`, so it is released with `free`, not `destroy`)
+        if (this.paths) |p| {
+            this.paths = null;
+            bun.default_allocator.free(p);
+        }
+        if (this.cf_paths) |cf| {
+            this.cf_paths = null;
+            CF.Release(cf);
+        }
+        // NOTE(review): the CFStrings created below are never passed to `CF.Release` in this function and the array is created with null callbacks — confirm they are released elsewhere.
+        const paths = bun.default_allocator.alloc(?*anyopaque, this.watcher_count) catch unreachable;
+        var count: u32 = 0;
+        for (watchers) |w| {
+            if (w) |watcher| {
+                const path = CF.StringCreateWithFileSystemRepresentation(null, watcher.path.ptr);
+                paths[count] = path;
+                count += 1;
+            }
+        }
+
+        const cf_paths = CF.ArrayCreate(null, paths.ptr, count, null);
+        var ctx: FSEventStreamContext = .{
+            .info = this,
+        };
+
+        const latency: CFAbsoluteTime = 0.05;
+        // Explanation of selected flags:
+        // 1. NoDefer - without this flag, events that are happening continuously
+        //    (i.e. each event is happening after time interval less than `latency`,
+        //    counted from previous event), will be deferred and passed to callback
+        //    once they'll either fill whole OS buffer, or when this continuous stream
+        //    will stop (i.e. there'll be delay between events, bigger than
+        //    `latency`).
+        //    Specifying this flag will invoke callback after `latency` time passed
+        //    since event.
+        // 2. FileEvents - fire callback for file changes too (by default it is firing
+        //    it only for directory changes).
+        //
+        const flags: FSEventStreamCreateFlags = kFSEventStreamCreateFlagNoDefer | kFSEventStreamCreateFlagFileEvents;
+
+        //
+        // NOTE: It might sound like a good idea to remember last seen StreamEventId,
+        // but in reality one dir might have last StreamEventId less than, the other,
+        // that is being watched now. Which will cause FSEventStream API to report
+        // changes to files from the past.
+        //
+        const ref = CS.FSEventStreamCreate(null, _events_cb, &ctx, cf_paths, CS.kFSEventStreamEventIdSinceNow, latency, flags);
+
+        CS.FSEventStreamScheduleWithRunLoop(ref, this.loop, CF.RunLoopDefaultMode.*);
+        if (CS.FSEventStreamStart(ref) == 0) {
+            // clean up on failure; the previous stream was already torn down above, so no stream is left running
+            bun.default_allocator.free(paths);
+            CF.Release(cf_paths);
+            CS.FSEventStreamInvalidate(ref);
+            CS.FSEventStreamRelease(ref);
+            return;
+        }
+        this.fsevent_stream = ref;
+        this.paths = paths;
+        this.cf_paths = cf_paths;
+    }
+
+    /// Stores `watcher` in the loop's watcher list (reusing an empty slot when there is one) and makes sure a `_schedule` task is pending.
+    fn registerWatcher(this: *FSEventsLoop, watcher: *FSEventsWatcher) void {
+        this.mutex.lock();
+        defer this.mutex.unlock();
+        if (this.watcher_count != this.watchers.len) {
+            // The count differs from the list length, so look for a slot cleared by unregisterWatcher and take the first one.
+            for (this.watchers.slice()) |*slot| {
+                if (slot.* != null) continue;
+                slot.* = watcher;
+                this.watcher_count += 1;
+                break;
+            }
+        } else {
+            this.watcher_count += 1;
+            this.watchers.push(bun.default_allocator, watcher) catch unreachable;
+        }
+        // At most one rebuild task is pending at a time; `_schedule` resets the flag when it runs.
+        if (!this.has_scheduled_watchers) {
+            this.has_scheduled_watchers = true;
+            this.enqueueTaskConcurrent(Task.New(FSEventsLoop, _schedule).init(this));
+        }
+    }
+
+    /// Removes `watcher` from the loop's watcher list; a watcher that is not in the list is ignored.
+    fn unregisterWatcher(this: *FSEventsLoop, watcher: *FSEventsWatcher) void {
+        this.mutex.lock();
+        defer this.mutex.unlock();
+        const slots = this.watchers.slice();
+        for (slots, 0..) |*slot, index| {
+            const registered = slot.* orelse continue;
+            if (registered != watcher) continue;
+
+            slot.* = null;
+            // A vacated tail slot is dropped by shrinking the list; any other slot stays empty for reuse.
+            if (index + 1 == slots.len) this.watchers.len -= 1;
+            this.watcher_count -= 1;
+            break;
+        }
+        // NOTE(review): no `_schedule` task is queued here, so the running stream is not
+        // rebuilt when a watcher is removed — confirm intended.
+    }
+
+    // Task body queued by `deinit`; runs on the CF loop to close the loop
+    fn _stop(this: *FSEventsLoop) void {
+        // Ask CoreFoundation to stop this loop's run loop.
+        CoreFoundation.get().RunLoopStop(this.loop);
+    }
+    /// Stops the CF thread, waits for it to exit, then releases what the loop owns, including `this` itself.
+    fn deinit(this: *FSEventsLoop) void {
+        // Queue the stop task for the CF thread, then block until the thread has finished.
+        this.enqueueTaskConcurrent(Task.New(FSEventsLoop, FSEventsLoop._stop).init(this));
+        this.thread.join();
+
+        const core_foundation = CoreFoundation.get();
+        core_foundation.Release(this.signal_source);
+        this.signal_source = null;
+
+        this.sem.deinit();
+        this.mutex.deinit();
+
+        // Detach any watcher that is still registered so that its own deinit does not call back into this loop.
+        if (this.watcher_count > 0) {
+            while (this.watchers.popOrNull()) |entry| {
+                const remaining = entry orelse continue;
+                remaining.loop = null;
+            }
+        }
+
+        this.watchers.deinitWithAllocator(bun.default_allocator);
+        bun.default_allocator.destroy(this);
+    }
+};
+
+/// A single watch request registered with an `FSEventsLoop`.
+pub const FSEventsWatcher = struct {
+    /// Watched path; reported event paths are matched against it by prefix. Stored as given (not copied).
+    path: string,
+    /// Invoked by the loop for every matching event.
+    callback: Callback,
+    /// Loop this watcher is registered with; set to null when that loop is deinitialized.
+    loop: ?*FSEventsLoop,
+    /// When false, events from subdirectories are not reported.
+    recursive: bool,
+    /// Opaque value handed back to `callback`.
+    ctx: ?*anyopaque,
+
+    /// Signature of the event handler; its `ctx` argument is the watcher's `ctx` field.
+    const Callback = *const fn (ctx: ?*anyopaque, path: string, is_file: bool, is_rename: bool) void;
+
+    /// Heap-allocates a watcher and registers it with `loop`. Release it with `deinit`.
+    pub fn init(loop: *FSEventsLoop, path: string, recursive: bool, callback: Callback, ctx: ?*anyopaque) *FSEventsWatcher {
+        const watcher = bun.default_allocator.create(FSEventsWatcher) catch unreachable;
+        watcher.* = .{ .path = path, .callback = callback, .loop = loop, .recursive = recursive, .ctx = ctx };
+        loop.registerWatcher(watcher);
+        return watcher;
+    }
+
+    /// Unregisters from the loop (when one is still attached) and frees the watcher.
+    pub fn deinit(this: *FSEventsWatcher) void {
+        if (this.loop) |owner| owner.unregisterWatcher(this);
+        bun.default_allocator.destroy(this);
+    }
+};
+
+/// Watches `path` on the shared default FSEvents loop, creating that loop on first use.
+/// Fails with the error from `FSEventsLoop.init` when the loop cannot be created.
+pub fn watch(path: string, recursive: bool, callback: FSEventsWatcher.Callback, ctx: ?*anyopaque) !*FSEventsWatcher {
+    // Fast path: the loop already exists, so the mutex is not taken.
+    if (fsevents_default_loop) |existing| return FSEventsWatcher.init(existing, path, recursive, callback, ctx);
+
+    // Slow path: re-check under the lock before creating the loop.
+    fsevents_default_loop_mutex.lock();
+    defer fsevents_default_loop_mutex.unlock();
+    if (fsevents_default_loop == null) fsevents_default_loop = try FSEventsLoop.init();
+    return FSEventsWatcher.init(fsevents_default_loop.?, path, recursive, callback, ctx);
+}
diff --git a/src/bun.js/node/node.classes.ts b/src/bun.js/node/node.classes.ts
index f984077e4..2efad5245 100644
--- a/src/bun.js/node/node.classes.ts
+++ b/src/bun.js/node/node.classes.ts
@@ -2,6 +2,35 @@ import { define } from "../scripts/class-definitions";
 
 export default [
   define({
+    name: "FSWatcher",
+    construct: false,
+    noConstructor: true,
+    finalize: true,
+    configurable: false,
+    hasPendingActivity: true,
+    klass: {},
+    JSType: "0b11101110",
+    proto: {
+      ref: {
+        fn: "doRef",
+        length: 0,
+      },
+      unref: {
+        fn: "doUnref",
+        length: 0,
+      },
+      hasRef: {
+        fn: "hasRef",
+        length: 0,
+      },
+      close: {
+        fn: "doClose",
+        length: 0,
+      },
+    },
+    values: ["listener"],
+  }),
+  define({
     name: "Timeout",
     construct: false,
     noConstructor: true,
@@ -300,7 +329,7 @@ export default [
       utimes: { fn: "utimes", length: 4 },
       utimesSync: { fn: "utimesSync", length: 3 },
       // TODO:
-      // watch: { fn: "watch", length: 3 },
+      watch: { fn: "watch", length: 3 },
       // watchFile: { fn: "watchFile", length: 3 },
       writeFile: { fn: "writeFile", length: 4 },
       writeFileSync: { fn: "writeFileSync", length: 3 },
diff --git a/src/bun.js/node/node_fs.zig b/src/bun.js/node/node_fs.zig
index 254d58455..3f298c5c7 100644
--- a/src/bun.js/node/node_fs.zig
+++ b/src/bun.js/node/node_fs.zig
@@ -34,9 +34,8 @@ const Mode = JSC.Node.Mode;
 
 const uid_t = std.os.uid_t;
 const gid_t = std.os.gid_t;
-
 /// u63 to allow one null bit
-const ReadPosition = u63;
+const ReadPosition = i64;
 
 const Stats = JSC.Node.Stats;
 const Dirent = JSC.Node.Dirent;
@@ -136,6 +135,154 @@ pub const Arguments = struct {
         }
     };
 
+    /// Arguments for `writev`: the target descriptor, the buffers to write, and an optional file position.
+    pub const Writev = struct {
+        fd: FileDescriptor,
+        buffers: JSC.Node.VectorArrayBuffer,
+        /// When null, `NodeFS.writev` calls `_writev` instead of `_pwritev`.
+        position: ?u52 = 0,
+
+        pub fn deinit(_: *const @This()) void {}
+
+        /// Parses `(fd, buffers[, position])` from `arguments`.
+        /// On a missing or mistyped argument an error is reported through `exception` and null is returned.
+        pub fn fromJS(ctx: JSC.C.JSContextRef, arguments: *ArgumentsSlice, exception: JSC.C.ExceptionRef) ?Writev {
+            // File descriptor (required).
+            const fd_value = arguments.nextEat() orelse {
+                if (exception.* == null) {
+                    JSC.throwInvalidArguments(
+                        "file descriptor is required",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            const fd = JSC.Node.fileDescriptorFromJS(ctx, fd_value, exception) orelse {
+                if (exception.* == null) {
+                    JSC.throwInvalidArguments(
+                        "file descriptor must be a number",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            // Buffers (required).
+            const buffers_value = arguments.protectEatNext() orelse {
+                JSC.throwInvalidArguments("Expected an ArrayBufferView[]", .{}, ctx, exception);
+                return null;
+            };
+            const buffers = JSC.Node.VectorArrayBuffer.fromJS(ctx, buffers_value, exception, arguments.arena.allocator()) orelse {
+                if (exception.* == null) {
+                    JSC.throwInvalidArguments(
+                        "buffers must be an array of TypedArray",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            // Position (optional): an absent, undefined or null argument leaves it null; anything else must be a number.
+            const position: ?u52 = blk: {
+                const pos_value = arguments.nextEat() orelse break :blk null;
+                if (pos_value.isUndefinedOrNull()) break :blk null;
+                if (!pos_value.isNumber()) {
+                    JSC.throwInvalidArguments(
+                        "position must be a number",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                    return null;
+                }
+                break :blk pos_value.to(u52);
+            };
+
+            return Writev{ .fd = fd, .buffers = buffers, .position = position };
+        }
+    };
+
+    /// Arguments for `readv`: the source descriptor, the buffers to fill, and an optional file position.
+    pub const Readv = struct {
+        fd: FileDescriptor,
+        buffers: JSC.Node.VectorArrayBuffer,
+        /// When null, `NodeFS.readv` calls `_readv` instead of `_preadv`.
+        position: ?u52 = 0,
+
+        pub fn deinit(_: *const @This()) void {}
+
+        /// Parses `(fd, buffers[, position])` from `arguments`.
+        /// On a missing or mistyped argument an error is reported through `exception` and null is returned.
+        pub fn fromJS(ctx: JSC.C.JSContextRef, arguments: *ArgumentsSlice, exception: JSC.C.ExceptionRef) ?Readv {
+            // File descriptor (required).
+            const fd_value = arguments.nextEat() orelse {
+                if (exception.* == null) {
+                    JSC.throwInvalidArguments(
+                        "file descriptor is required",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            const fd = JSC.Node.fileDescriptorFromJS(ctx, fd_value, exception) orelse {
+                if (exception.* == null) {
+                    JSC.throwInvalidArguments(
+                        "file descriptor must be a number",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            // Buffers (required).
+            const buffers_value = arguments.protectEatNext() orelse {
+                JSC.throwInvalidArguments("Expected an ArrayBufferView[]", .{}, ctx, exception);
+                return null;
+            };
+            const buffers = JSC.Node.VectorArrayBuffer.fromJS(ctx, buffers_value, exception, arguments.arena.allocator()) orelse {
+                if (exception.* == null) {
+                    JSC.throwInvalidArguments(
+                        "buffers must be an array of TypedArray",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            // Position (optional): an absent, undefined or null argument leaves it null; anything else must be a number.
+            const position: ?u52 = blk: {
+                const pos_value = arguments.nextEat() orelse break :blk null;
+                if (pos_value.isUndefinedOrNull()) break :blk null;
+                if (!pos_value.isNumber()) {
+                    JSC.throwInvalidArguments(
+                        "position must be a number",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                    return null;
+                }
+                break :blk pos_value.to(u52);
+            };
+
+            return Readv{ .fd = fd, .buffers = buffers, .position = position };
+        }
+    };
+
     pub const FTruncate = struct {
         fd: FileDescriptor,
         len: ?JSC.WebCore.Blob.SizeType = null,
@@ -505,6 +652,7 @@ pub const Arguments = struct {
     pub const Stat = struct {
         path: PathLike,
         big_int: bool = false,
+        throw_if_no_entry: bool = true,
 
         pub fn deinit(this: Stat) void {
             this.path.deinit();
@@ -525,13 +673,25 @@ pub const Arguments = struct {
 
             if (exception.* != null) return null;
 
+            var throw_if_no_entry = true;
+
             const big_int = brk: {
                 if (arguments.next()) |next_val| {
                     if (next_val.isObject()) {
                         if (next_val.isCallable(ctx.ptr().vm())) break :brk false;
                         arguments.eat();
 
-                        if (next_val.getOptional(ctx.ptr(), "bigint", bool) catch false) |big_int| {
+                        if (next_val.getOptional(ctx.ptr(), "throwIfNoEntry", bool) catch {
+                            path.deinit();
+                            return null;
+                        }) |throw_if_no_entry_val| {
+                            throw_if_no_entry = throw_if_no_entry_val;
+                        }
+
+                        if (next_val.getOptional(ctx.ptr(), "bigint", bool) catch {
+                            path.deinit();
+                            return null;
+                        }) |big_int| {
                             break :brk big_int;
                         }
                     }
@@ -541,7 +701,7 @@ pub const Arguments = struct {
 
             if (exception.* != null) return null;
 
-            return Stat{ .path = path, .big_int = big_int };
+            return Stat{ .path = path, .big_int = big_int, .throw_if_no_entry = throw_if_no_entry };
         }
     };
 
@@ -1377,7 +1537,7 @@ pub const Arguments = struct {
                         // fs.write(fd, string[, position[, encoding]], callback)
                         .string => {
                             if (current.isNumber()) {
-                                args.position = current.toU32();
+                                args.position = current.to(i52);
                                 arguments.eat();
                                 current = arguments.next() orelse break :parse;
                             }
@@ -1393,18 +1553,18 @@ pub const Arguments = struct {
                                 break :parse;
                             }
 
-                            if (!current.isNumber()) break :parse;
-                            args.offset = current.toU32();
+                            if (!(current.isNumber() or current.isBigInt())) break :parse;
+                            args.offset = current.to(u52);
                             arguments.eat();
                             current = arguments.next() orelse break :parse;
 
-                            if (!current.isNumber()) break :parse;
-                            args.length = current.toU32();
+                            if (!(current.isNumber() or current.isBigInt())) break :parse;
+                            args.length = current.to(u52);
                             arguments.eat();
                             current = arguments.next() orelse break :parse;
 
-                            if (!current.isNumber()) break :parse;
-                            args.position = current.toU32();
+                            if (!(current.isNumber() or current.isBigInt())) break :parse;
+                            args.position = current.to(i52);
                             arguments.eat();
                         },
                     }
@@ -1484,8 +1644,8 @@ pub const Arguments = struct {
 
             if (arguments.next()) |current| {
                 arguments.eat();
-                if (current.isNumber()) {
-                    args.offset = current.toU32();
+                if (current.isNumber() or current.isBigInt()) {
+                    args.offset = current.to(u52);
 
                     if (arguments.remaining.len < 2) {
                         JSC.throwInvalidArguments(
@@ -1497,8 +1657,8 @@ pub const Arguments = struct {
 
                         return null;
                     }
-
-                    args.length = arguments.remaining[0].toU32();
+                    if (arguments.remaining[0].isNumber() or arguments.remaining[0].isBigInt())
+                        args.length = arguments.remaining[0].to(u52);
 
                     if (args.length == 0) {
                         JSC.throwInvalidArguments(
@@ -1511,26 +1671,26 @@ pub const Arguments = struct {
                         return null;
                     }
 
-                    const position: i32 = if (arguments.remaining[1].isNumber())
-                        arguments.remaining[1].toInt32()
-                    else
-                        -1;
+                    if (arguments.remaining[1].isNumber() or arguments.remaining[1].isBigInt())
+                        args.position = @intCast(ReadPosition, arguments.remaining[1].to(i52));
 
-                    args.position = if (position > -1) @intCast(ReadPosition, position) else null;
                     arguments.remaining = arguments.remaining[2..];
                 } else if (current.isObject()) {
-                    if (current.getIfPropertyExists(ctx.ptr(), "offset")) |num| {
-                        args.offset = num.toU32();
+                    if (current.getTruthy(ctx.ptr(), "offset")) |num| {
+                        if (num.isNumber() or num.isBigInt()) {
+                            args.offset = num.to(u52);
+                        }
                     }
 
-                    if (current.getIfPropertyExists(ctx.ptr(), "length")) |num| {
-                        args.length = num.toU32();
+                    if (current.getTruthy(ctx.ptr(), "length")) |num| {
+                        if (num.isNumber() or num.isBigInt()) {
+                            args.length = num.to(u52);
+                        }
                     }
 
-                    if (current.getIfPropertyExists(ctx.ptr(), "position")) |num| {
-                        const position: i32 = if (num.isEmptyOrUndefinedOrNull()) -1 else num.coerce(i32, ctx);
-                        if (position > -1) {
-                            args.position = @intCast(ReadPosition, position);
+                    if (current.getTruthy(ctx.ptr(), "position")) |num| {
+                        if (num.isNumber() or num.isBigInt()) {
+                            args.position = num.to(i52);
                         }
                     }
                 }
@@ -2264,7 +2424,7 @@ pub const Arguments = struct {
             return CopyFile{
                 .src = src,
                 .dest = dest,
-                .mode = @intToEnum(Constants.Copyfile, mode),
+                .mode = @enumFromInt(Constants.Copyfile, mode),
             };
         }
     };
@@ -2313,7 +2473,7 @@ pub const Arguments = struct {
     };
 
     pub const UnwatchFile = void;
-    pub const Watch = void;
+    pub const Watch = JSC.Node.FSWatcher.Arguments;
     pub const WatchFile = void;
     pub const Fsync = struct {
         fd: FileDescriptor,
@@ -2350,6 +2510,18 @@ pub const Arguments = struct {
     };
 };
 
+/// Either a `Stats` value or a `not_found` marker, which is surfaced to JS as `undefined`.
+pub const StatOrNotFound = union(enum) {
+    stats: Stats,
+    not_found: void,
+
+    /// Converts to a JS value: the stats object, or `undefined` for `not_found`.
+    pub fn toJS(this: *StatOrNotFound, globalObject: *JSC.JSGlobalObject) JSC.JSValue {
+        if (this.* == .not_found) return JSC.JSValue.undefined;
+        return this.stats.toJS(globalObject);
+    }
+};
+
 const Return = struct {
     pub const Access = void;
     pub const AppendFile = void;
@@ -2368,11 +2540,12 @@ const Return = struct {
     pub const Lchmod = void;
     pub const Lchown = void;
     pub const Link = void;
-    pub const Lstat = Stats;
-    pub const Mkdir = string;
+    pub const Lstat = StatOrNotFound;
+    pub const Mkdir = bun.String;
     pub const Mkdtemp = JSC.ZigString;
     pub const Open = FileDescriptor;
     pub const WriteFile = void;
+    pub const Readv = Read;
     pub const Read = struct {
         bytes_read: u52,
 
@@ -2469,18 +2642,20 @@ const Return = struct {
     pub const RealpathNative = Realpath;
     pub const Rename = void;
     pub const Rmdir = void;
-    pub const Stat = Stats;
+    pub const Stat = StatOrNotFound;
 
     pub const Symlink = void;
     pub const Truncate = void;
     pub const Unlink = void;
     pub const UnwatchFile = void;
-    pub const Watch = void;
+    pub const Watch = JSC.JSValue;
     pub const WatchFile = void;
     pub const Utimes = void;
 
     pub const Chown = void;
     pub const Lutimes = void;
+
+    pub const Writev = Write;
 };
 
 /// Bun's implementation of the Node.js "fs" module
@@ -2493,12 +2668,13 @@ pub const NodeFS = struct {
     /// That means a stack-allocated buffer won't suffice. Instead, we re-use
     /// the heap allocated buffer on the NodefS struct
     sync_error_buf: [bun.MAX_PATH_BYTES]u8 = undefined,
+    vm: ?*JSC.VirtualMachine = null,
 
     pub const ReturnType = Return;
 
     pub fn access(this: *NodeFS, args: Arguments.Access, comptime _: Flavor) Maybe(Return.Access) {
         var path = args.path.sliceZ(&this.sync_error_buf);
-        const rc = Syscall.system.access(path, @enumToInt(args.mode));
+        const rc = Syscall.system.access(path, @intFromEnum(args.mode));
         return Maybe(Return.Access).errnoSysP(rc, .access, path) orelse Maybe(Return.Access).success;
     }
 
@@ -2528,7 +2704,7 @@ pub const NodeFS = struct {
                 const path = path_.sliceZ(&this.sync_error_buf);
                 switch (comptime flavor) {
                     .sync => {
-                        const fd = switch (Syscall.open(path, @enumToInt(FileSystemFlags.a), 0o000666)) {
+                        const fd = switch (Syscall.open(path, @intFromEnum(FileSystemFlags.a), 0o000666)) {
                             .result => |result| result,
                             .err => |err| return .{ .err = err },
                         };
@@ -2594,7 +2770,7 @@ pub const NodeFS = struct {
                         };
 
                         if (!os.S.ISREG(stat_.mode)) {
-                            return Maybe(Return.CopyFile){ .err = .{ .errno = @enumToInt(C.SystemErrno.ENOTSUP) } };
+                            return Maybe(Return.CopyFile){ .err = .{ .errno = @intFromEnum(C.SystemErrno.ENOTSUP) } };
                         }
 
                         // 64 KB is about the break-even point for clonefile() to be worth it
@@ -2723,7 +2899,7 @@ pub const NodeFS = struct {
                     };
 
                     if (!os.S.ISREG(stat_.mode)) {
-                        return Maybe(Return.CopyFile){ .err = .{ .errno = @enumToInt(C.SystemErrno.ENOTSUP) } };
+                        return Maybe(Return.CopyFile){ .err = .{ .errno = @intFromEnum(C.SystemErrno.ENOTSUP) } };
                     }
 
                     var flags: Mode = std.os.O.CREAT | std.os.O.WRONLY;
@@ -2978,8 +3154,13 @@ pub const NodeFS = struct {
                         &this.sync_error_buf,
                     ),
                 )) {
-                    .result => |result| Maybe(Return.Lstat){ .result = Return.Lstat.init(result, false) },
-                    .err => |err| Maybe(Return.Lstat){ .err = err },
+                    .result => |result| Maybe(Return.Lstat){ .result = .{ .stats = Stats.init(result, args.big_int) } },
+                    .err => |err| brk: {
+                        if (!args.throw_if_no_entry and err.getErrno() == .NOENT) {
+                            return Maybe(Return.Lstat){ .result = .{ .not_found = {} } };
+                        }
+                        break :brk Maybe(Return.Lstat){ .err = err };
+                    },
                 };
             },
             else => {},
@@ -2997,7 +3178,7 @@ pub const NodeFS = struct {
             .sync => {
                 const path = args.path.sliceZ(&this.sync_error_buf);
                 return switch (Syscall.mkdir(path, args.mode)) {
-                    .result => Maybe(Return.Mkdir){ .result = "" },
+                    .result => Maybe(Return.Mkdir){ .result = bun.String.empty },
                     .err => |err| Maybe(Return.Mkdir){ .err = err },
                 };
             },
@@ -3026,24 +3207,29 @@ pub const NodeFS = struct {
                     .err => |err| {
                         switch (err.getErrno()) {
                             else => {
-                                @memcpy(&this.sync_error_buf, path.ptr, len);
+                                @memcpy(this.sync_error_buf[0..len], path[0..len]);
                                 return .{ .err = err.withPath(this.sync_error_buf[0..len]) };
                             },
 
                             .EXIST => {
-                                return Option{ .result = "" };
+                                return Option{ .result = bun.String.empty };
                             },
                             // continue
                             .NOENT => {},
                         }
                     },
                     .result => {
-                        return Option{ .result = args.path.slice() };
+                        return Option{
+                            .result = if (args.path == .slice_with_underlying_string)
+                                args.path.slice_with_underlying_string.underlying
+                            else
+                                bun.String.create(args.path.slice()),
+                        };
                     },
                 }
 
                 var working_mem = &this.sync_error_buf;
-                @memcpy(working_mem, path.ptr, len);
+                @memcpy(working_mem[0..len], path[0..len]);
 
                 var i: u16 = len - 1;
 
@@ -3111,10 +3297,9 @@ pub const NodeFS = struct {
                         switch (err.getErrno()) {
                             // handle the race condition
                             .EXIST => {
-                                var display_path: []const u8 = "";
+                                var display_path = bun.String.empty;
                                 if (first_match != std.math.maxInt(u16)) {
-                                    // TODO: this leaks memory
-                                    display_path = bun.default_allocator.dupe(u8, display_path[0..first_match]) catch unreachable;
+                                    display_path = bun.String.create(working_mem[0..first_match]);
                                 }
                                 return Option{ .result = display_path };
                             },
@@ -3126,12 +3311,14 @@ pub const NodeFS = struct {
                         }
                     },
                     .result => {
-                        var display_path = args.path.slice();
-                        if (first_match != std.math.maxInt(u16)) {
-                            // TODO: this leaks memory
-                            display_path = bun.default_allocator.dupe(u8, display_path[0..first_match]) catch unreachable;
-                        }
-                        return Option{ .result = display_path };
+                        return Option{
+                            .result = if (first_match != std.math.maxInt(u16))
+                                bun.String.create(working_mem[0..first_match])
+                            else if (args.path == .slice_with_underlying_string)
+                                args.path.slice_with_underlying_string.underlying
+                            else
+                                bun.String.create(args.path.slice()),
+                        };
                     },
                 }
             },
@@ -3146,7 +3333,7 @@ pub const NodeFS = struct {
         const prefix_slice = args.prefix.slice();
         const len = @min(prefix_slice.len, prefix_buf.len -| 7);
         if (len > 0) {
-            @memcpy(prefix_buf, prefix_slice.ptr, len);
+            @memcpy(prefix_buf[0..len], prefix_slice[0..len]);
         }
         prefix_buf[len..][0..6].* = "XXXXXX".*;
         prefix_buf[len..][6] = 0;
@@ -3162,15 +3349,15 @@ pub const NodeFS = struct {
             };
         }
         // std.c.getErrno(rc) returns SUCCESS if rc is null so we call std.c._errno() directly
-        const errno = @intToEnum(std.c.E, std.c._errno().*);
-        return .{ .err = Syscall.Error{ .errno = @truncate(Syscall.Error.Int, @enumToInt(errno)), .syscall = .mkdtemp } };
+        const errno = @enumFromInt(std.c.E, std.c._errno().*);
+        return .{ .err = Syscall.Error{ .errno = @truncate(Syscall.Error.Int, @intFromEnum(errno)), .syscall = .mkdtemp } };
     }
     pub fn open(this: *NodeFS, args: Arguments.Open, comptime flavor: Flavor) Maybe(Return.Open) {
         switch (comptime flavor) {
             // The sync version does no allocation except when returning the path
             .sync => {
                 const path = args.path.sliceZ(&this.sync_error_buf);
-                return switch (Syscall.open(path, @enumToInt(args.flags), args.mode)) {
+                return switch (Syscall.open(path, @intFromEnum(args.flags), args.mode)) {
                     .err => |err| .{
                         .err = err.withPath(args.path.slice()),
                     },
@@ -3250,6 +3437,14 @@ pub const NodeFS = struct {
             );
     }
 
+    pub fn readv(this: *NodeFS, args: Arguments.Readv, comptime flavor: Flavor) Maybe(Return.Read) { // NOTE(review): helpers below are typed Maybe(Return.Readv); presumably the same type as Return.Read - confirm
+        return if (args.position == null) _readv(this, args, flavor) else _preadv(this, args, flavor); // a supplied position selects the positioned variant
+    }
+
+    pub fn writev(this: *NodeFS, args: Arguments.Writev, comptime flavor: Flavor) Maybe(Return.Write) { // vectored write entry point
+        return if (args.position == null) _writev(this, args, flavor) else _pwritev(this, args, flavor); // a supplied position selects the positioned variant
+    }
+
     pub fn write(this: *NodeFS, args: Arguments.Write, comptime flavor: Flavor) Maybe(Return.Write) {
         return if (args.position != null) _pwrite(this, args, flavor) else _write(this, args, flavor);
     }
@@ -3301,6 +3496,82 @@ pub const NodeFS = struct {
         return Maybe(Return.Write).todo;
     }
 
+    fn _preadv(_: *NodeFS, args: Arguments.Readv, comptime flavor: Flavor) Maybe(Return.Readv) { // vectored read at an explicit position
+        const position = args.position.?; // `readv` only routes here when a position was supplied
+
+        switch (comptime flavor) {
+            .sync => {
+                return switch (Syscall.preadv(args.fd, args.buffers.buffers.items, position)) {
+                    .err => |err| .{
+                        .err = err,
+                    },
+                    .result => |amt| .{ .result = .{
+                        .bytes_read = @truncate(u52, amt), // keeps the low 52 bits of the byte count
+                    } },
+                };
+            },
+            else => {}, // other flavors are not implemented yet
+        }
+        // Unimplemented flavors report `todo` using this function's own result type.
+        return Maybe(Return.Readv).todo;
+    }
+
+    fn _readv(_: *NodeFS, args: Arguments.Readv, comptime flavor: Flavor) Maybe(Return.Readv) { // vectored read without an explicit position
+        switch (comptime flavor) {
+            .sync => {
+                return switch (Syscall.readv(args.fd, args.buffers.buffers.items)) {
+                    .err => |err| .{
+                        .err = err,
+                    },
+                    .result => |amt| .{ .result = .{
+                        .bytes_read = @truncate(u52, amt), // keeps the low 52 bits of the byte count
+                    } },
+                };
+            },
+            else => {}, // other flavors are not implemented yet
+        }
+        // Unimplemented flavors report `todo` using this function's own result type.
+        return Maybe(Return.Readv).todo;
+    }
+
+    fn _pwritev(_: *NodeFS, args: Arguments.Writev, comptime flavor: Flavor) Maybe(Return.Write) {
+        // Vectored write at an explicit file position. `writev` only routes
+        // here when `args.position` is non-null, so the unwrap below holds
+        // on that path.
+        const offset = args.position.?;
+        // Only the synchronous flavor is implemented; others report `todo`.
+        if (comptime flavor != .sync) return Maybe(Return.Write).todo;
+
+        const iovecs = args.buffers.buffers.items;
+        return switch (Syscall.pwritev(args.fd, iovecs, offset)) {
+            .err => |failure| .{ .err = failure },
+            .result => |written| .{
+                .result = .{
+                    // `@truncate` keeps the low 52 bits of the byte count.
+                    .bytes_written = @truncate(u52, written),
+                },
+            },
+        };
+    }
+
+    fn _writev(_: *NodeFS, args: Arguments.Writev, comptime flavor: Flavor) Maybe(Return.Write) {
+        // Vectored write without an explicit position (`writev` routes here
+        // when `args.position` is null). Only the synchronous flavor is
+        // implemented; every other flavor reports `todo`.
+        if (comptime flavor != .sync) return Maybe(Return.Write).todo;
+
+        const iovecs = args.buffers.buffers.items;
+        return switch (Syscall.writev(args.fd, iovecs)) {
+            .err => |failure| .{ .err = failure },
+            .result => |written| .{
+                .result = .{
+                    // `@truncate` keeps the low 52 bits of the byte count.
+                    .bytes_written = @truncate(u52, written),
+                },
+            },
+        };
+    }
+
     pub fn readdir(this: *NodeFS, args: Arguments.Readdir, comptime flavor: Flavor) Maybe(Return.Readdir) {
         return switch (args.encoding) {
             .buffer => _readdir(
@@ -3443,6 +3714,35 @@ pub const NodeFS = struct {
                 const fd = switch (args.path) {
                     .path => brk: {
                         path = args.path.path.sliceZ(&this.sync_error_buf);
+                        if (this.vm) |vm| {
+                            if (vm.standalone_module_graph) |graph| {
+                                if (graph.find(path)) |file| {
+                                    if (args.encoding == .buffer) {
+                                        return .{
+                                            .result = .{
+                                                .buffer = Buffer.fromBytes(
+                                                    bun.default_allocator.dupe(u8, file.contents) catch @panic("out of memory"),
+                                                    bun.default_allocator,
+                                                    .Uint8Array,
+                                                ),
+                                            },
+                                        };
+                                    } else if (comptime string_type == .default)
+                                        return .{
+                                            .result = .{
+                                                .string = bun.default_allocator.dupe(u8, file.contents) catch @panic("out of memory"),
+                                            },
+                                        }
+                                    else
+                                        return .{
+                                            .result = .{
+                                                .null_terminated = bun.default_allocator.dupeZ(u8, file.contents) catch @panic("out of memory"),
+                                            },
+                                        };
+                                }
+                            }
+                        }
+
                         break :brk switch (Syscall.open(
                             path,
                             os.O.RDONLY | os.O.NOCTTY,
@@ -3605,7 +3905,7 @@ pub const NodeFS = struct {
                 break :brk switch (Syscall.openat(
                     args.dirfd,
                     path,
-                    @enumToInt(args.flag) | os.O.NOCTTY,
+                    @intFromEnum(args.flag) | os.O.NOCTTY,
                     args.mode,
                 )) {
                     .err => |err| return .{
@@ -3672,7 +3972,7 @@ pub const NodeFS = struct {
         }
 
         // https://github.com/oven-sh/bun/issues/2931
-        if ((@enumToInt(args.flag) & std.os.O.APPEND) == 0) {
+        if ((@intFromEnum(args.flag) & std.os.O.APPEND) == 0) {
             _ = ftruncateSync(.{ .fd = fd, .len = @truncate(JSC.WebCore.Blob.SizeType, written) });
         }
 
@@ -3819,12 +4119,12 @@ pub const NodeFS = struct {
 
                         while (true) {
                             if (Maybe(Return.Rmdir).errnoSys(bun.C.darwin.removefileat(std.os.AT.FDCWD, dest, null, flags), .rmdir)) |errno| {
-                                switch (@intToEnum(os.E, errno.err.errno)) {
+                                switch (@enumFromInt(os.E, errno.err.errno)) {
                                     .AGAIN, .INTR => continue,
                                     .NOENT => return Maybe(Return.Rmdir).success,
                                     .MLINK => {
                                         var copy: [bun.MAX_PATH_BYTES]u8 = undefined;
-                                        @memcpy(&copy, dest.ptr, dest.len);
+                                        @memcpy(copy[0..dest.len], dest);
                                         copy[dest.len] = 0;
                                         var dest_copy = copy[0..dest.len :0];
                                         switch (Syscall.unlink(dest_copy).getErrno()) {
@@ -3906,7 +4206,7 @@ pub const NodeFS = struct {
                         }
 
                         if (Maybe(Return.Rm).errnoSys(bun.C.darwin.removefileat(std.os.AT.FDCWD, dest, null, flags), .unlink)) |errno| {
-                            switch (@intToEnum(os.E, errno.err.errno)) {
+                            switch (@enumFromInt(os.E, errno.err.errno)) {
                                 .AGAIN, .INTR => continue,
                                 .NOENT => {
                                     if (args.force) {
@@ -3918,7 +4218,7 @@ pub const NodeFS = struct {
 
                                 .MLINK => {
                                     var copy: [bun.MAX_PATH_BYTES]u8 = undefined;
-                                    @memcpy(&copy, dest.ptr, dest.len);
+                                    @memcpy(copy[0..dest.len], dest);
                                     copy[dest.len] = 0;
                                     var dest_copy = copy[0..dest.len :0];
                                     switch (Syscall.unlink(dest_copy).getErrno()) {
@@ -4063,8 +4363,13 @@ pub const NodeFS = struct {
                         &this.sync_error_buf,
                     ),
                 )) {
-                    .result => |result| Maybe(Return.Stat){ .result = Return.Stat.init(result, false) },
-                    .err => |err| Maybe(Return.Stat){ .err = err },
+                    .result => |result| Maybe(Return.Stat){ .result = .{ .stats = Stats.init(result, args.big_int) } },
+                    .err => |err| brk: {
+                        if (!args.throw_if_no_entry and err.getErrno() == .NOENT) {
+                            return Maybe(Return.Stat){ .result = .{ .not_found = {} } };
+                        }
+                        break :brk Maybe(Return.Stat){ .err = err };
+                    },
                 });
             },
             else => {},
@@ -4181,8 +4486,12 @@ pub const NodeFS = struct {
 
         return Maybe(Return.Lutimes).todo;
     }
-    pub fn watch(_: *NodeFS, _: Arguments.Watch, comptime _: Flavor) Maybe(Return.Watch) {
-        return Maybe(Return.Watch).todo;
+    pub fn watch(_: *NodeFS, args: Arguments.Watch, comptime _: Flavor) Maybe(Return.Watch) { // creates a watcher from `args`; the flavor argument is ignored
+        const watcher = args.createFSWatcher() catch |err| {
+            args.global_this.throwError(err, "Failed to watch filename"); // the failure is raised through the global object...
+            return Maybe(Return.Watch){ .result = JSC.JSValue.jsUndefined() }; // ...so the Maybe itself still carries a `.result` (undefined), not `.err`
+        };
+        return Maybe(Return.Watch){ .result = watcher };
     }
     pub fn createReadStream(_: *NodeFS, _: Arguments.CreateReadStream, comptime _: Flavor) Maybe(Return.CreateReadStream) {
         return Maybe(Return.CreateReadStream).todo;
diff --git a/src/bun.js/node/node_fs_binding.zig b/src/bun.js/node/node_fs_binding.zig
index 74b769bf6..a4cc62cd3 100644
--- a/src/bun.js/node/node_fs_binding.zig
+++ b/src/bun.js/node/node_fs_binding.zig
@@ -229,6 +229,10 @@ pub const NodeJSFS = struct {
     pub const lutimesSync = callSync(.lutimes);
     pub const rmSync = callSync(.rm);
     pub const rmdirSync = callSync(.rmdir);
+    pub const writev = call(.writev);
+    pub const writevSync = callSync(.writev);
+    pub const readv = call(.readv);
+    pub const readvSync = callSync(.readv);
 
     pub const fdatasyncSync = callSync(.fdatasync);
     pub const fdatasync = call(.fdatasync);
@@ -241,12 +245,10 @@ pub const NodeJSFS = struct {
         return JSC.Node.Stats.getConstructor(globalThis);
     }
 
+    pub const watch = callSync(.watch);
+
     // Not implemented yet:
     const notimpl = fdatasync;
     pub const opendir = notimpl;
     pub const opendirSync = notimpl;
-    pub const readv = notimpl;
-    pub const readvSync = notimpl;
-    pub const writev = notimpl;
-    pub const writevSync = notimpl;
 };
diff --git a/src/bun.js/node/node_fs_constant.zig b/src/bun.js/node/node_fs_constant.zig
index 378f332c6..8e642ebad 100644
--- a/src/bun.js/node/node_fs_constant.zig
+++ b/src/bun.js/node/node_fs_constant.zig
@@ -26,17 +26,17 @@ pub const Constants = struct {
         pub const force = 4;
 
         pub inline fn isForceClone(this: Copyfile) bool {
-            return (@enumToInt(this) & COPYFILE_FICLONE_FORCE) != 0;
+            return (@intFromEnum(this) & COPYFILE_FICLONE_FORCE) != 0;
         }
 
         pub inline fn shouldntOverwrite(this: Copyfile) bool {
-            return (@enumToInt(this) & COPYFILE_EXCL) != 0;
+            return (@intFromEnum(this) & COPYFILE_EXCL) != 0;
         }
 
         pub inline fn canUseClone(this: Copyfile) bool {
             _ = this;
             return Environment.isMac;
-            // return (@enumToInt(this) | COPYFILE_FICLONE) != 0;
+            // return (@intFromEnum(this) | COPYFILE_FICLONE) != 0;
         }
     };
 
diff --git a/src/bun.js/node/node_fs_watcher.zig b/src/bun.js/node/node_fs_watcher.zig
new file mode 100644
index 000000000..d0af350c0
--- /dev/null
+++ b/src/bun.js/node/node_fs_watcher.zig
@@ -0,0 +1,919 @@
+const std = @import("std");
+const JSC = @import("root").bun.JSC;
+const bun = @import("root").bun;
+const Fs = @import("../../fs.zig");
+const Path = @import("../../resolver/resolve_path.zig");
+const Encoder = JSC.WebCore.Encoder;
+const Mutex = @import("../../lock.zig").Lock;
+
+const FSEvents = @import("./fs_events.zig");
+
+const VirtualMachine = JSC.VirtualMachine;
+const EventLoop = JSC.EventLoop;
+const PathLike = JSC.Node.PathLike;
+const ArgumentsSlice = JSC.Node.ArgumentsSlice;
+const Output = bun.Output;
+const string = bun.string;
+const StoredFileDescriptorType = bun.StoredFileDescriptorType;
+const Environment = bun.Environment;
+
+pub const FSWatcher = struct {
+    const watcher = @import("../../watcher.zig");
+    const options = @import("../../options.zig");
+    pub const Watcher = watcher.NewWatcher(*FSWatcher);
+    const log = Output.scoped(.FSWatcher, false);
+
+    pub const ChangeEvent = struct { // record of the last event queued by `onFileUpdate`, stored in `last_change_event`
+        hash: Watcher.HashType = 0, // `Watcher.getHash` of the affected path
+        event_type: FSWatchTask.EventType = .change,
+        time_stamp: i64 = 0, // `std.time.milliTimestamp()` of that event; 0 is treated as "nothing recorded yet"
+    };
+
+    onAccept: std.ArrayHashMapUnmanaged(FSWatcher.Watcher.HashType, bun.BabyList(OnAcceptCallback), bun.ArrayIdentityContext, false) = .{},
+    ctx: *VirtualMachine,
+    verbose: bool = false,
+    file_paths: bun.BabyList(string) = .{},
+    entry_path: ?string = null,
+    entry_dir: string = "",
+    last_change_event: ChangeEvent = .{},
+
+    // JSObject
+    mutex: Mutex,
+    signal: ?*JSC.AbortSignal,
+    persistent: bool,
+    default_watcher: ?*FSWatcher.Watcher,
+    fsevents_watcher: ?*FSEvents.FSEventsWatcher,
+    poll_ref: JSC.PollRef = .{},
+    globalThis: *JSC.JSGlobalObject,
+    js_this: JSC.JSValue,
+    encoding: JSC.Node.Encoding,
+    // user can call close and pre-detach so we need to track this
+    closed: bool,
+    // counts pending tasks so we only deinit after all tasks are done
+    task_count: u32,
+    has_pending_activity: std.atomic.Atomic(bool),
+    pub usingnamespace JSC.Codegen.JSFSWatcher;
+
+    pub fn eventLoop(this: FSWatcher) *EventLoop { // event loop of the VirtualMachine stored in `ctx`
+        return this.ctx.eventLoop();
+    }
+
+    pub fn enqueueTaskConcurrent(this: FSWatcher, task: *JSC.ConcurrentTask) void { // forwards `task` to the owning VM's event loop
+        this.eventLoop().enqueueTaskConcurrent(task);
+    }
+
+    pub fn deinit(this: *FSWatcher) void { // releases the stored paths and then the FSWatcher allocation itself; `this` is invalid afterwards
+        // stop all managers and signals
+        this.detach();
+
+        while (this.file_paths.popOrNull()) |file_path| {
+            bun.default_allocator.destroy(file_path); // NOTE(review): paths pushed by addDirectory are dupeZ'd slices; std pairs dupe with `free` and create with `destroy` - confirm this release is intended
+        }
+        this.file_paths.deinitWithAllocator(bun.default_allocator);
+        if (this.entry_path) |path| {
+            this.entry_path = null; // cleared before release; `entry_dir` may alias the same memory (see addDirectory)
+            bun.default_allocator.destroy(path); // NOTE(review): same slice-vs-destroy question as above
+        }
+        bun.default_allocator.destroy(this);
+    }
+
+    pub const FSWatchTask = struct { // batch of up to 8 watcher events handed to the VM event loop as one concurrent task
+        ctx: *FSWatcher,
+        count: u8 = 0, // number of valid items in `entries`
+
+        entries: [8]Entry = undefined, // only entries[0..count] are initialized
+        concurrent_task: JSC.ConcurrentTask = undefined, // `.task` is filled in by `enqueue` on the heap copy
+
+        pub const EventType = enum {
+            rename,
+            change,
+            @"error",
+            abort,
+        };
+
+        pub const EventFreeType = enum { // how `cleanEntries` releases an entry's `file_path`
+            destroy,
+            free,
+            none,
+        };
+
+        pub const Entry = struct {
+            file_path: string, // for `.@"error"` entries this carries the error message instead of a path
+            event_type: EventType,
+            free_type: EventFreeType,
+        };
+
+        pub fn append(this: *FSWatchTask, file_path: string, event_type: EventType, free_type: EventFreeType) void { // adds one entry, flushing the batch first when it is full
+            if (this.count == 8) { // `entries` is full: hand the current batch off and start a fresh one
+                this.enqueue();
+                var ctx = this.ctx;
+                this.* = .{
+                    .ctx = ctx,
+                    .count = 0,
+                };
+            }
+
+            this.entries[this.count] = .{
+                .file_path = file_path,
+                .event_type = event_type,
+                .free_type = free_type,
+            };
+            this.count += 1;
+        }
+
+        pub fn run(this: *FSWatchTask) void { // dispatches every queued entry to the watcher, then drops the task's ref
+            // this runs on the JS context thread
+
+            for (this.entries[0..this.count]) |entry| {
+                switch (entry.event_type) {
+                    .rename => {
+                        this.ctx.emit(entry.file_path, "rename");
+                    },
+                    .change => {
+                        this.ctx.emit(entry.file_path, "change");
+                    },
+                    .@"error" => {
+                        // file_path is the error message in this case
+                        this.ctx.emitError(entry.file_path);
+                    },
+                    .abort => {
+                        this.ctx.emitIfAborted();
+                    },
+                }
+            }
+
+            this.ctx.unrefTask(); // pairs with the refTask() taken in `enqueue`
+        }
+
+        pub fn enqueue(this: *FSWatchTask) void { // moves the pending entries into a heap copy and queues it; no-op when empty
+            if (this.count == 0)
+                return;
+
+            // refTask() returning false means the watcher is closed or detached (already queued tasks may still hold refs, but no new ref is taken)
+            if (this.ctx.refTask()) {
+                var that = bun.default_allocator.create(FSWatchTask) catch unreachable;
+
+                that.* = this.*; // the heap copy now carries the entries
+                this.count = 0; // the local batch no longer refers to them
+                that.concurrent_task.task = JSC.Task.init(that);
+                this.ctx.enqueueTaskConcurrent(&that.concurrent_task);
+                return;
+            }
+            // closed or detached so just cleanEntries
+            this.cleanEntries();
+        }
+        pub fn cleanEntries(this: *FSWatchTask) void { // releases every pending entry's path according to its free_type and resets count to 0
+            while (this.count > 0) {
+                this.count -= 1;
+                switch (this.entries[this.count].free_type) {
+                    .destroy => bun.default_allocator.destroy(this.entries[this.count].file_path), // NOTE(review): file_path is a slice; confirm `destroy` is the intended release for it
+                    .free => bun.default_allocator.free(this.entries[this.count].file_path),
+                    else => {}, // `.none`: the path is not owned by the task (e.g. a static error name)
+                }
+            }
+        }
+
+        pub fn deinit(this: *FSWatchTask) void { // for heap-allocated tasks: releases remaining entries, then the task itself
+            this.cleanEntries();
+            bun.default_allocator.destroy(this);
+        }
+    };
+
+    fn NewCallback(comptime FunctionSignature: type) type { // builds a tagged union holding either a JS callback or a native one of the given signature
+        return union(enum) {
+            javascript_callback: JSC.Strong,
+            zig_callback: struct {
+                ptr: *anyopaque, // opaque pointer stored next to the function; presumably its context argument - confirm at call sites
+                function: *const FunctionSignature,
+            },
+        };
+    }
+
+    pub const OnAcceptCallback = NewCallback(fn ( // callback type stored per hash in the `onAccept` map
+        vm: *JSC.VirtualMachine,
+        specifier: []const u8,
+    ) void);
+
+    fn addDirectory(ctx: *FSWatcher, fs_watcher: *FSWatcher.Watcher, fd: StoredFileDescriptorType, file_path: string, recursive: bool, buf: *[bun.MAX_PATH_BYTES + 1]u8, is_entry_path: bool) !void {
+        // Registers the directory `file_path` (open as `fd`) and the files inside it with `fs_watcher`, descending into sub-directories when `recursive`.
+        // On any failure `ctx` and `fs_watcher` are deinitialized exactly once and the error is returned; `buf` is scratch space for joined paths.
+        var dir_path_clone = bun.default_allocator.dupeZ(u8, file_path) catch unreachable; // copied first: `file_path` may point into `buf`, which is overwritten below
+        if (is_entry_path) {
+            ctx.entry_path = dir_path_clone;
+            ctx.entry_dir = dir_path_clone;
+        } else {
+            ctx.file_paths.push(bun.default_allocator, dir_path_clone) catch unreachable;
+        }
+        fs_watcher.addDirectory(fd, dir_path_clone, FSWatcher.Watcher.getHash(file_path), false) catch |err| {
+            ctx.deinit();
+            fs_watcher.deinit(true);
+            return err;
+        };
+
+        var iter = (std.fs.IterableDir{ .dir = std.fs.Dir{
+            .fd = fd,
+        } }).iterate();
+
+        while (iter.next() catch |err| {
+            ctx.deinit();
+            fs_watcher.deinit(true);
+            return err;
+        }) |entry| {
+            var parts = [2]string{ dir_path_clone, entry.name };
+            var entry_path = Path.joinAbsStringBuf(
+                Fs.FileSystem.instance.topLevelDirWithoutTrailingSlash(),
+                buf,
+                &parts,
+                .auto,
+            );
+
+            buf[entry_path.len] = 0; // NUL-terminate in place so the joined path can be used as a [:0] slice
+            var entry_path_z = buf[0..entry_path.len :0];
+
+            var fs_info = fdFromAbsolutePathZ(entry_path_z) catch |err| {
+                ctx.deinit();
+                fs_watcher.deinit(true);
+                return err;
+            };
+
+            if (fs_info.is_file) {
+                const file_path_clone = bun.default_allocator.dupeZ(u8, entry_path) catch unreachable;
+
+                ctx.file_paths.push(bun.default_allocator, file_path_clone) catch unreachable;
+
+                fs_watcher.addFile(fs_info.fd, file_path_clone, FSWatcher.Watcher.getHash(entry_path), options.Loader.file, 0, null, false) catch |err| {
+                    ctx.deinit();
+                    fs_watcher.deinit(true);
+                    return err;
+                };
+            } else {
+                if (recursive) {
+                    // A failing nested call has already deinitialized `ctx`
+                    // and `fs_watcher` before returning. Doing it again here
+                    // would touch `ctx` after FSWatcher.deinit destroyed it,
+                    // so the error is only propagated.
+                    try addDirectory(ctx, fs_watcher, fs_info.fd, entry_path, recursive, buf, false);
+                }
+            }
+        }
+    }
+
+    pub fn onError( // queues a single `.@"error"` event for this watcher
+        this: *FSWatcher,
+        err: anyerror,
+    ) void {
+        var current_task: FSWatchTask = .{
+            .ctx = this,
+        };
+        current_task.append(@errorName(err), .@"error", .none); // the error name is static, hence `.none`: nothing to release
+        current_task.enqueue();
+    }
+
+    pub fn onFSEventUpdate( // FSEvents callback: queues one "rename" or "change" event for `path`
+        ctx: ?*anyopaque, // the FSWatcher this callback was registered for; must be non-null
+        path: string,
+        _: bool,
+        is_rename: bool,
+    ) void {
+        const this = bun.cast(*FSWatcher, ctx.?);
+
+        var current_task: FSWatchTask = .{
+            .ctx = this,
+        };
+        defer current_task.enqueue(); // runs after the append below
+
+        const relative_path = bun.default_allocator.dupe(u8, path) catch unreachable; // copied so the entry does not depend on the caller's `path` buffer
+        const event_type: FSWatchTask.EventType = if (is_rename) .rename else .change;
+        // The copy is a slice from `dupe`, so it is tagged `.free` (slice release) rather than `.destroy` (single-item release).
+        current_task.append(relative_path, event_type, .free);
+    }
+
+    pub fn onFileUpdate(
+        this: *FSWatcher,
+        events: []watcher.WatchEvent,
+        changed_files: []?[:0]u8,
+        watchlist: watcher.Watchlist,
+    ) void { // converts a batch of watcher events into "rename"/"change" entries and queues them as one FSWatchTask
+        var slice = watchlist.slice();
+        const file_paths = slice.items(.file_path);
+
+        var counts = slice.items(.count);
+        const kinds = slice.items(.kind);
+        var _on_file_update_path_buf: [bun.MAX_PATH_BYTES]u8 = undefined; // scratch buffer for "<dir>/<changed name>" in the directory branch
+
+        var ctx = this.default_watcher.?;
+        defer ctx.flushEvictions();
+        defer Output.flush();
+
+        var bundler = if (@TypeOf(this.ctx.bundler) == *bun.Bundler)
+            this.ctx.bundler
+        else
+            &this.ctx.bundler;
+
+        var fs: *Fs.FileSystem = bundler.fs;
+
+        var current_task: FSWatchTask = .{
+            .ctx = this,
+        };
+        defer current_task.enqueue(); // whatever was appended below is queued when this function returns
+
+        const time_stamp = std.time.milliTimestamp();
+        const time_diff = time_stamp - this.last_change_event.time_stamp; // computed once for the whole batch
+
+        for (events) |event| {
+            const file_path = file_paths[event.index];
+            const update_count = counts[event.index] + 1;
+            counts[event.index] = update_count;
+            const kind = kinds[event.index];
+
+            if (comptime Environment.isDebug) {
+                if (this.verbose) {
+                    Output.prettyErrorln("[watch] {s} ({s}, {})", .{ file_path, @tagName(kind), event.op });
+                }
+            }
+
+            switch (kind) {
+                .file => {
+                    if (event.op.delete) {
+                        ctx.removeAtIndex(
+                            event.index,
+                            0,
+                            &.{},
+                            .file,
+                        );
+                    }
+
+                    var file_hash: FSWatcher.Watcher.HashType = FSWatcher.Watcher.getHash(file_path);
+
+                    if (event.op.write or event.op.delete or event.op.rename) {
+                        const event_type: FSWatchTask.EventType = if (event.op.delete or event.op.rename or event.op.move_to) .rename else .change;
+                        // skip consecutive duplicates. NOTE(review): `and` binds tighter than `or`, so within the 1 ms window an event passes only if BOTH its type and its hash differ from the last one - confirm intended
+                        if ((this.last_change_event.time_stamp == 0 or time_diff > 1) or this.last_change_event.event_type != event_type and this.last_change_event.hash != file_hash) {
+                            this.last_change_event.time_stamp = time_stamp;
+                            this.last_change_event.event_type = event_type;
+                            this.last_change_event.hash = file_hash;
+
+                            const relative_slice = fs.relative(this.entry_dir, file_path);
+
+                            if (this.verbose)
+                                Output.prettyErrorln("<r><d>File changed: {s}<r>", .{relative_slice});
+
+                            const relative_path = bun.default_allocator.dupe(u8, relative_slice) catch unreachable;
+                            // `relative_path` is a slice from `dupe`, so it is tagged `.free` (slice release), not `.destroy`
+                            current_task.append(relative_path, event_type, .free);
+                        }
+                    }
+                },
+                .directory => {
+                    // macOS should use FSEvents for directories
+                    if (comptime Environment.isMac) {
+                        @panic("Unexpected directory watch");
+                    }
+
+                    const affected = event.names(changed_files);
+
+                    for (affected) |changed_name_| {
+                        const changed_name: []const u8 = bun.asByteSlice(changed_name_.?);
+                        if (changed_name.len == 0 or changed_name[0] == '~' or changed_name[0] == '.') continue; // ignore empty names and names starting with '~' or '.'
+
+                        var file_hash: FSWatcher.Watcher.HashType = 0;
+                        const relative_slice: string = brk: {
+                            var file_path_without_trailing_slash = std.mem.trimRight(u8, file_path, std.fs.path.sep_str);
+
+                            @memcpy(_on_file_update_path_buf[0..file_path_without_trailing_slash.len], file_path_without_trailing_slash);
+
+                            _on_file_update_path_buf[file_path_without_trailing_slash.len] = std.fs.path.sep;
+
+                            @memcpy(_on_file_update_path_buf[file_path_without_trailing_slash.len + 1 ..][0..changed_name.len], changed_name);
+                            const path_slice = _on_file_update_path_buf[0 .. file_path_without_trailing_slash.len + changed_name.len + 1];
+                            file_hash = FSWatcher.Watcher.getHash(path_slice);
+
+                            const relative = fs.relative(this.entry_dir, path_slice);
+
+                            break :brk relative;
+                        };
+
+                        // skip consecutive duplicates (NOTE(review): same operator-precedence question as in the file branch above)
+                        const event_type: FSWatchTask.EventType = .rename; // renaming folders and creating folders or files are always reported as rename
+                        if ((this.last_change_event.time_stamp == 0 or time_diff > 1) or this.last_change_event.event_type != event_type and this.last_change_event.hash != file_hash) {
+                            const relative_path = bun.default_allocator.dupe(u8, relative_slice) catch unreachable;
+
+                            this.last_change_event.time_stamp = time_stamp;
+                            this.last_change_event.event_type = event_type;
+                            this.last_change_event.hash = file_hash;
+                            // `relative_path` is a slice from `dupe`, so it is tagged `.free` (slice release), not `.destroy`
+                            current_task.append(relative_path, event_type, .free);
+
+                            if (this.verbose)
+                                Output.prettyErrorln("<r> <d>Dir change: {s}<r>", .{relative_path});
+                        }
+                    }
+
+                    if (this.verbose and affected.len == 0) {
+                        Output.prettyErrorln("<r> <d>Dir change: {s}<r>", .{fs.relative(this.entry_dir, file_path)});
+                    }
+                },
+            }
+        }
+    }
+
+    pub const Arguments = struct {
+        path: PathLike,
+        listener: JSC.JSValue,
+        global_this: JSC.C.JSContextRef,
+        signal: ?*JSC.AbortSignal,
+        persistent: bool,
+        recursive: bool,
+        encoding: JSC.Node.Encoding,
+        verbose: bool,
+        pub fn fromJS(ctx: JSC.C.JSContextRef, arguments: *ArgumentsSlice, exception: JSC.C.ExceptionRef) ?Arguments { // Parses (path, [options], [listener]) into Arguments; returns null when validation fails, reporting through `exception`.
+            const vm = ctx.vm();
+            const path = PathLike.fromJS(ctx, arguments, exception) orelse { // first argument: the path to watch
+                if (exception.* == null) { // only raise the generic message when PathLike.fromJS did not report its own
+                    JSC.throwInvalidArguments(
+                        "filename must be a string or TypedArray",
+                        .{},
+                        ctx,
+                        exception,
+                    );
+                }
+                return null;
+            };
+
+            if (exception.* != null) return null;
+            var listener: JSC.JSValue = .zero; // .zero means "no listener found yet"
+            var signal: ?*JSC.AbortSignal = null;
+            var persistent: bool = true;
+            var recursive: bool = false;
+            var encoding: JSC.Node.Encoding = .utf8;
+            var verbose = false;
+            if (arguments.nextEat()) |options_or_callable| { // second argument: either an options object or the listener itself
+
+                // options object: each recognised key is type-checked before it overrides its default
+                if (options_or_callable.isObject()) {
+                    if (options_or_callable.get(ctx, "persistent")) |persistent_| {
+                        if (!persistent_.isBoolean()) {
+                            JSC.throwInvalidArguments(
+                                "persistent must be a boolean.",
+                                .{},
+                                ctx,
+                                exception,
+                            );
+                            return null;
+                        }
+                        persistent = persistent_.toBoolean();
+                    }
+
+                    if (options_or_callable.get(ctx, "verbose")) |verbose_| {
+                        if (!verbose_.isBoolean()) {
+                            JSC.throwInvalidArguments(
+                                "verbose must be a boolean.",
+                                .{},
+                                ctx,
+                                exception,
+                            );
+                            return null;
+                        }
+                        verbose = verbose_.toBoolean();
+                    }
+
+                    if (options_or_callable.get(ctx, "encoding")) |encoding_| {
+                        if (!encoding_.isString()) {
+                            JSC.throwInvalidArguments(
+                                "encoding must be a string.",
+                                .{},
+                                ctx,
+                                exception,
+                            );
+                            return null;
+                        }
+                        if (JSC.Node.Encoding.fromJS(encoding_, ctx.ptr())) |node_encoding| {
+                            encoding = node_encoding;
+                        } else {
+                            JSC.throwInvalidArguments(
+                                "invalid encoding.",
+                                .{},
+                                ctx,
+                                exception,
+                            );
+                            return null;
+                        }
+                    }
+
+                    if (options_or_callable.get(ctx, "recursive")) |recursive_| {
+                        if (!recursive_.isBoolean()) {
+                            JSC.throwInvalidArguments(
+                                "recursive must be a boolean.",
+                                .{},
+                                ctx,
+                                exception,
+                            );
+                            return null;
+                        }
+                        recursive = recursive_.toBoolean();
+                    }
+
+                    // abort signal: the value must unwrap to a JSC.AbortSignal
+                    if (options_or_callable.get(ctx, "signal")) |signal_| {
+                        if (JSC.AbortSignal.fromJS(signal_)) |signal_obj| {
+                            // keep the JS value alive while the native signal pointer is being captured
+                            signal_.ensureStillAlive();
+                            signal = signal_obj;
+                        } else {
+                            JSC.throwInvalidArguments(
+                                "signal is not of type AbortSignal.",
+                                .{},
+                                ctx,
+                                exception,
+                            );
+
+                            return null;
+                        }
+                    }
+
+                    // listener: the argument following the options object; must be callable when present
+                    if (arguments.nextEat()) |callable| {
+                        if (!callable.isCell() or !callable.isCallable(vm)) {
+                            exception.* = JSC.toInvalidArguments("Expected \"listener\" callback to be a function", .{}, ctx).asObjectRef();
+                            return null;
+                        }
+                        listener = callable;
+                    }
+                } else { // not an object: the second argument must be the listener
+                    if (!options_or_callable.isCell() or !options_or_callable.isCallable(vm)) {
+                        exception.* = JSC.toInvalidArguments("Expected \"listener\" callback to be a function", .{}, ctx).asObjectRef();
+                        return null;
+                    }
+                    listener = options_or_callable;
+                }
+            }
+            if (listener == .zero) { // no callable was found in either position
+                exception.* = JSC.toInvalidArguments("Expected \"listener\" callback", .{}, ctx).asObjectRef();
+                return null;
+            }
+
+            return Arguments{
+                .path = path,
+                .listener = listener,
+                .global_this = ctx,
+                .signal = signal,
+                .persistent = persistent,
+                .recursive = recursive,
+                .encoding = encoding,
+                .verbose = verbose,
+            };
+        }
+
+        pub fn createFSWatcher(this: Arguments) !JSC.JSValue { // Builds a watcher via FSWatcher.init and returns its JS wrapper.
+            const obj = try FSWatcher.init(this);
+            if (obj.js_this != .zero) {
+                return obj.js_this;
+            }
+            return JSC.JSValue.jsUndefined(); // no wrapper is set on the watcher (js_this is .zero)
+        }
+    };
+
+    pub fn initJS(this: *FSWatcher, listener: JSC.JSValue) void { // Creates the JS wrapper, caches `listener` on it and hooks up the abort signal.
+        if (this.persistent) { // persistent watchers take a poll ref against this.ctx
+            this.poll_ref.ref(this.ctx);
+        }
+
+        const js_this = FSWatcher.toJS(this, this.globalThis);
+        js_this.ensureStillAlive();
+        this.js_this = js_this;
+        FSWatcher.listenerSetCached(js_this, this.globalThis, listener);
+
+        if (this.signal) |s| {
+            // already aborted?
+            if (s.aborted()) {
+                // safely abort next tick: queue an .abort task instead of calling emitAbort directly
+                var current_task: FSWatchTask = .{
+                    .ctx = this,
+                };
+                current_task.append("", .abort, .none);
+                current_task.enqueue();
+            } else {
+                // watch for abortion: emitAbort is registered as the signal's callback
+                this.signal = s.listen(FSWatcher, this, FSWatcher.emitAbort);
+            }
+        }
+    }
+
+    pub fn emitIfAborted(this: *FSWatcher) void { // Forwards the signal's abort reason to emitAbort when a signal is attached and already aborted.
+        if (this.signal) |s| {
+            if (s.aborted()) {
+                const err = s.abortReason();
+                this.emitAbort(err);
+            }
+        }
+    }
+
+    pub fn emitAbort(this: *FSWatcher, err: JSC.JSValue) void { // Calls the listener with ("error", reason) and then closes the watcher; no-op once closed.
+        if (this.closed) return;
+        defer this.close(); // close() runs after the listener call below
+
+        err.ensureStillAlive();
+        if (this.js_this != .zero) {
+            const js_this = this.js_this;
+            js_this.ensureStillAlive();
+            if (FSWatcher.listenerGetCached(js_this)) |listener| {
+                listener.ensureStillAlive();
+                var args = [_]JSC.JSValue{ // second element falls back to a generic abort error when `err` is empty/undefined/null
+                    JSC.ZigString.static("error").toValue(this.globalThis),
+                    if (err.isEmptyOrUndefinedOrNull()) JSC.WebCore.AbortSignal.createAbortError(JSC.ZigString.static("The user aborted a request"), &JSC.ZigString.Empty, this.globalThis) else err,
+                };
+                _ = listener.callWithGlobalThis(
+                    this.globalThis,
+                    &args,
+                );
+            }
+        }
+    }
+    pub fn emitError(this: *FSWatcher, err: string) void { // Calls the listener with ("error", error instance built from `err`) and then closes the watcher; no-op once closed.
+        if (this.closed) return;
+        defer this.close(); // close() runs after the listener call below
+
+        if (this.js_this != .zero) {
+            const js_this = this.js_this;
+            js_this.ensureStillAlive();
+            if (FSWatcher.listenerGetCached(js_this)) |listener| {
+                listener.ensureStillAlive();
+                var args = [_]JSC.JSValue{
+                    JSC.ZigString.static("error").toValue(this.globalThis),
+                    JSC.ZigString.fromUTF8(err).toErrorInstance(this.globalThis),
+                };
+                _ = listener.callWithGlobalThis(
+                    this.globalThis,
+                    &args,
+                );
+            }
+        }
+    }
+
+    pub fn emit(this: *FSWatcher, file_name: string, comptime eventType: string) void { // Calls the listener with (eventType, filename); does nothing when js_this is .zero or no listener is cached.
+        if (this.js_this != .zero) {
+            const js_this = this.js_this;
+            js_this.ensureStillAlive();
+            if (FSWatcher.listenerGetCached(js_this)) |listener| {
+                listener.ensureStillAlive();
+                var filename: JSC.JSValue = JSC.JSValue.jsUndefined(); // stays undefined when file_name is empty
+                if (file_name.len > 0) {
+                    if (this.encoding == .buffer)
+                        filename = JSC.ArrayBuffer.createBuffer(this.globalThis, file_name)
+                    else if (this.encoding == .utf8) {
+                        filename = JSC.ZigString.fromUTF8(file_name).toValueGC(this.globalThis);
+                    } else {
+                        // any other encoding: let Encoder convert the bytes to the requested encoding
+                        filename = Encoder.toStringAtRuntime(file_name.ptr, file_name.len, this.globalThis, this.encoding);
+                    }
+                }
+                var args = [_]JSC.JSValue{
+                    JSC.ZigString.static(eventType).toValue(this.globalThis),
+                    filename,
+                };
+                _ = listener.callWithGlobalThis(
+                    this.globalThis,
+                    &args,
+                );
+            }
+        }
+    }
+
+    pub fn doRef(this: *FSWatcher, _: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue { // Re-acquires the poll ref unless the watcher is closed or already persistent; always returns undefined.
+        if (!this.closed and !this.persistent) {
+            this.persistent = true;
+            this.poll_ref.ref(this.ctx);
+        }
+        return JSC.JSValue.jsUndefined();
+    }
+
+    pub fn doUnref(this: *FSWatcher, _: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue { // Releases the poll ref if the watcher is currently persistent; always returns undefined.
+        if (this.persistent) {
+            this.persistent = false;
+            this.poll_ref.unref(this.ctx);
+        }
+        return JSC.JSValue.jsUndefined();
+    }
+
+    pub fn hasRef(this: *FSWatcher, _: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue { // Returns the `persistent` flag as a JS boolean.
+        return JSC.JSValue.jsBoolean(this.persistent);
+    }
+
+    // this can be called from Watcher Thread or JS Context Thread; returns false (without counting) once the watcher is closed
+    pub fn refTask(this: *FSWatcher) bool {
+        this.mutex.lock();
+        defer this.mutex.unlock();
+        // stop new references
+        if (this.closed) return false;
+        this.task_count += 1;
+        return true;
+    }
+
+    pub fn hasPendingActivity(this: *FSWatcher) callconv(.C) bool { // Acquire-loads the has_pending_activity flag.
+        @fence(.Acquire);
+        return this.has_pending_activity.load(.Acquire);
+    }
+    // only called from Main Thread; despite the name this unconditionally clears has_pending_activity (release-store of false)
+    pub fn updateHasPendingActivity(this: *FSWatcher) void {
+        @fence(.Release);
+        this.has_pending_activity.store(false, .Release);
+    }
+
+    // unref is always called on main JS Context Thread; clears pending activity when the last task of a closed watcher is released
+    pub fn unrefTask(this: *FSWatcher) void {
+        this.mutex.lock();
+        defer this.mutex.unlock();
+        this.task_count -= 1;
+        if (this.closed and this.task_count == 0) {
+            this.updateHasPendingActivity();
+        }
+    }
+
+    pub fn close(
+        this: *FSWatcher,
+    ) void { // Marks the watcher closed (first call only), emits "close", detaches, and clears pending activity if no tasks remain.
+        this.mutex.lock();
+        if (!this.closed) {
+            this.closed = true;
+
+            // emit should only be called unlocked
+            this.mutex.unlock();
+
+            this.emit("", "close");
+            // we immediately detach here
+            this.detach();
+
+            // no need to lock again, because ref checks closed and unref is only called on main thread
+            if (this.task_count == 0) {
+                this.updateHasPendingActivity();
+            }
+        } else {
+            this.mutex.unlock(); // already closed: nothing to do besides releasing the lock
+        }
+    }
+
+    // this can be called multiple times: each field is reset before its resource is released, so a repeat call finds nothing left to do
+    pub fn detach(this: *FSWatcher) void {
+        if (this.persistent) {
+            this.persistent = false;
+            this.poll_ref.unref(this.ctx);
+        }
+
+        if (this.signal) |signal| {
+            this.signal = null;
+            signal.detach(this);
+        }
+
+        if (this.default_watcher) |default_watcher| {
+            this.default_watcher = null;
+            default_watcher.deinit(true);
+        }
+
+        if (this.fsevents_watcher) |fsevents_watcher| {
+            this.fsevents_watcher = null;
+            fsevents_watcher.deinit();
+        }
+
+        this.js_this = .zero; // the emit* functions check js_this != .zero, so no listener is invoked after this
+    }
+
+    pub fn doClose(this: *FSWatcher, _: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue { // Delegates to close() and returns undefined.
+        this.close();
+        return JSC.JSValue.jsUndefined();
+    }
+
+    pub fn finalize(this: *FSWatcher) callconv(.C) void { // Delegates to deinit().
+        this.deinit();
+    }
+
+    const PathResult = struct { // Result of fdFromAbsolutePathZ.
+        fd: StoredFileDescriptorType = 0, // descriptor opened for the path
+        is_file: bool = true, // false when the path was opened as a directory
+    };
+
+    fn fdFromAbsolutePathZ(
+        absolute_path_z: [:0]const u8,
+    ) !PathResult { // Opens `absolute_path_z` and reports the descriptor plus whether it is treated as a file or a directory.
+        const stat = try bun.C.lstat_absolute(absolute_path_z);
+        var result = PathResult{};
+
+        switch (stat.kind) {
+            .sym_link => {
+                const file = try std.fs.openFileAbsoluteZ(absolute_path_z, .{ .mode = .read_only });
+                errdefer file.close(); // do not leak the descriptor if stat() below fails
+                result.fd = file.handle;
+                const _stat = try file.stat(); // classify what the opened handle refers to, not the link itself
+                result.is_file = _stat.kind != .directory;
+            },
+            .directory => {
+                const dir = (try std.fs.openIterableDirAbsoluteZ(absolute_path_z, .{
+                    .access_sub_paths = true,
+                })).dir;
+                result.fd = dir.fd;
+                result.is_file = false;
+            },
+            else => {
+                const file = try std.fs.openFileAbsoluteZ(absolute_path_z, .{ .mode = .read_only });
+                result.fd = file.handle;
+                result.is_file = true;
+            },
+        }
+        return result;
+    }
+
+    pub fn init(args: Arguments) !*FSWatcher { // Resolves the path, allocates the watcher, attaches a watcher backend and creates the JS wrapper.
+        var buf: [bun.MAX_PATH_BYTES + 1]u8 = undefined;
+        var slice = args.path.slice();
+        if (bun.strings.startsWith(slice, "file://")) {
+            slice = slice[6..]; // "file://" is 7 bytes; starting at index 6 keeps its final '/'
+        }
+        var parts = [_]string{slice};
+
+        var file_path = Path.joinAbsStringBuf(
+            Fs.FileSystem.instance.top_level_dir,
+            &buf,
+            &parts,
+            .auto,
+        );
+
+        buf[file_path.len] = 0;
+        var file_path_z = buf[0..file_path.len :0];
+
+        var fs_type = try fdFromAbsolutePathZ(file_path_z);
+
+        var ctx = try bun.default_allocator.create(FSWatcher);
+        // Allocate file_paths before ctx.* is assigned: on failure ctx is still uninitialized, so only the raw allocation is released (never deinit()).
+        const file_paths = bun.BabyList(string).initCapacity(bun.default_allocator, 1) catch |err| {
+            bun.default_allocator.destroy(ctx);
+            return err;
+        };
+        const vm = args.global_this.bunVM();
+        ctx.* = .{
+            .ctx = vm,
+            .mutex = Mutex.init(),
+            .signal = if (args.signal) |s| s.ref() else null,
+            .persistent = args.persistent,
+            .default_watcher = null,
+            .fsevents_watcher = null,
+            .globalThis = args.global_this,
+            .js_this = .zero,
+            .encoding = args.encoding,
+            .closed = false,
+            .task_count = 0,
+            .has_pending_activity = std.atomic.Atomic(bool).init(true),
+            .verbose = args.verbose,
+            .file_paths = file_paths,
+        };
+
+        if (comptime Environment.isMac) { // macOS: non-file paths are watched through FSEvents instead of the default watcher
+            if (!fs_type.is_file) {
+                var dir_path_clone = bun.default_allocator.dupeZ(u8, file_path) catch unreachable;
+                ctx.entry_path = dir_path_clone;
+                ctx.entry_dir = dir_path_clone;
+
+                ctx.fsevents_watcher = FSEvents.watch(dir_path_clone, args.recursive, onFSEventUpdate, bun.cast(*anyopaque, ctx)) catch |err| {
+                    ctx.deinit();
+                    return err;
+                };
+
+                ctx.initJS(args.listener);
+                return ctx;
+            }
+        }
+
+        var default_watcher = FSWatcher.Watcher.init(
+            ctx,
+            vm.bundler.fs,
+            bun.default_allocator,
+        ) catch |err| {
+            ctx.deinit();
+            return err;
+        };
+
+        ctx.default_watcher = default_watcher;
+
+        if (fs_type.is_file) { // single file: register just that file; entry_dir is its parent directory
+            var file_path_clone = bun.default_allocator.dupeZ(u8, file_path) catch unreachable;
+
+            ctx.entry_path = file_path_clone;
+            ctx.entry_dir = std.fs.path.dirname(file_path_clone) orelse file_path_clone;
+
+            default_watcher.addFile(fs_type.fd, file_path_clone, FSWatcher.Watcher.getHash(file_path), options.Loader.file, 0, null, false) catch |err| {
+                ctx.deinit();
+                return err;
+            };
+        } else { // directory: hand it to addDirectory, passing args.recursive through
+            addDirectory(ctx, default_watcher, fs_type.fd, file_path, args.recursive, &buf, true) catch |err| {
+                ctx.deinit();
+                return err;
+            };
+        }
+
+        default_watcher.start() catch |err| {
+            ctx.deinit();
+            return err;
+        };
+
+        ctx.initJS(args.listener); // the JS wrapper is only created once the backend is running
+        return ctx;
+    }
+};
diff --git a/src/bun.js/node/node_os.zig b/src/bun.js/node/node_os.zig
index 4b37640cf..483acb3e2 100644
--- a/src/bun.js/node/node_os.zig
+++ b/src/bun.js/node/node_os.zig
@@ -16,7 +16,7 @@ pub const Os = struct {
     pub const code = @embedFile("../os.exports.js");
 
     pub fn create(globalObject: *JSC.JSGlobalObject) callconv(.C) JSC.JSValue {
-        const module = JSC.JSValue.createEmptyObject(globalObject, 20);
+        const module = JSC.JSValue.createEmptyObject(globalObject, 22);
 
         module.put(globalObject, JSC.ZigString.static("arch"), JSC.NewFunction(globalObject, JSC.ZigString.static("arch"), 0, arch, true));
         module.put(globalObject, JSC.ZigString.static("cpus"), JSC.NewFunction(globalObject, JSC.ZigString.static("cpus"), 0, cpus, true));
@@ -31,7 +31,6 @@ pub const Os = struct {
         module.put(globalObject, JSC.ZigString.static("platform"), JSC.NewFunction(globalObject, JSC.ZigString.static("platform"), 0, platform, true));
         module.put(globalObject, JSC.ZigString.static("release"), JSC.NewFunction(globalObject, JSC.ZigString.static("release"), 0, release, true));
         module.put(globalObject, JSC.ZigString.static("setPriority"), JSC.NewFunction(globalObject, JSC.ZigString.static("setPriority"), 2, setPriority, true));
-        module.put(globalObject, JSC.ZigString.static("tmpdir"), JSC.NewFunction(globalObject, JSC.ZigString.static("tmpdir"), 0, tmpdir, true));
         module.put(globalObject, JSC.ZigString.static("totalmem"), JSC.NewFunction(globalObject, JSC.ZigString.static("totalmem"), 0, totalmem, true));
         module.put(globalObject, JSC.ZigString.static("type"), JSC.NewFunction(globalObject, JSC.ZigString.static("type"), 0, Os.type, true));
         module.put(globalObject, JSC.ZigString.static("uptime"), JSC.NewFunction(globalObject, JSC.ZigString.static("uptime"), 0, uptime, true));
@@ -79,8 +78,8 @@ pub const Os = struct {
         return if (comptime Environment.isLinux)
             cpusImplLinux(globalThis) catch {
                 const err = JSC.SystemError{
-                    .message = JSC.ZigString.init("Failed to get cpu information"),
-                    .code = JSC.ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
+                    .message = bun.String.static("Failed to get cpu information"),
+                    .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
                 };
 
                 globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
@@ -89,8 +88,8 @@ pub const Os = struct {
         else if (comptime Environment.isMac)
             cpusImplDarwin(globalThis) catch {
                 const err = JSC.SystemError{
-                    .message = JSC.ZigString.init("Failed to get cpu information"),
-                    .code = JSC.ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
+                    .message = bun.String.static("Failed to get cpu information"),
+                    .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
                 };
 
                 globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
@@ -211,7 +210,7 @@ pub const Os = struct {
         if (local_bindings.host_processor_info(std.c.mach_host_self(), local_bindings.PROCESSOR_CPU_LOAD_INFO, &num_cpus, @ptrCast(*local_bindings.processor_info_array_t, &info), &info_size) != .SUCCESS) {
             return error.no_processor_info;
         }
-        defer _ = std.c.vm_deallocate(std.c.mach_task_self(), @ptrToInt(info), info_size);
+        defer _ = std.c.vm_deallocate(std.c.mach_task_self(), @intFromPtr(info), info_size);
 
         // Ensure we got the amount of data we expected to guard against buffer overruns
         if (info_size != C.PROCESSOR_CPU_LOAD_INFO_COUNT * num_cpus) {
@@ -319,11 +318,11 @@ pub const Os = struct {
             //info.put(globalThis, JSC.ZigString.static("syscall"), JSC.ZigString.init("uv_os_getpriority").withEncoding().toValueGC(globalThis));
 
             const err = JSC.SystemError{
-                .message = JSC.ZigString.init("A system error occurred: uv_os_getpriority returned ESRCH (no such process)"),
-                .code = JSC.ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
+                .message = bun.String.static("A system error occurred: uv_os_getpriority returned ESRCH (no such process)"),
+                .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
                 //.info = info,
                 .errno = -3,
-                .syscall = JSC.ZigString.init("uv_os_getpriority"),
+                .syscall = bun.String.static("uv_os_getpriority"),
             };
 
             globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
@@ -378,10 +377,10 @@ pub const Os = struct {
         const rc = C.getifaddrs(&interface_start);
         if (rc != 0) {
             const err = JSC.SystemError{
-                .message = JSC.ZigString.init("A system error occurred: getifaddrs returned an error"),
-                .code = JSC.ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
-                .errno = @enumToInt(std.os.errno(rc)),
-                .syscall = JSC.ZigString.init("getifaddrs"),
+                .message = bun.String.static("A system error occurred: getifaddrs returned an error"),
+                .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
+                .errno = @intFromEnum(std.os.errno(rc)),
+                .syscall = bun.String.static("getifaddrs"),
             };
 
             globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
@@ -461,7 +460,7 @@ pub const Os = struct {
                 var cidr = JSC.JSValue.null;
                 if (maybe_suffix) |suffix| {
                     //NOTE addr_str might not start at buf[0] due to slicing in formatIp
-                    const start = @ptrToInt(addr_str.ptr) - @ptrToInt(&buf[0]);
+                    const start = @intFromPtr(addr_str.ptr) - @intFromPtr(&buf[0]);
                     // Start writing the suffix immediately after the address
                     const suffix_str = std.fmt.bufPrint(buf[start + addr_str.len ..], "/{}", .{suffix}) catch unreachable;
                     // The full cidr value is the address + the suffix
@@ -485,7 +484,7 @@ pub const Os = struct {
                 std.os.AF.INET => JSC.ZigString.static("IPv4"),
                 std.os.AF.INET6 => JSC.ZigString.static("IPv6"),
                 else => JSC.ZigString.static("unknown"),
-            }).toValue(globalThis));
+            }).toValueGC(globalThis));
 
             // mac <string> The MAC address of the network interface
             {
@@ -592,11 +591,11 @@ pub const Os = struct {
         switch (errcode) {
             .SRCH => {
                 const err = JSC.SystemError{
-                    .message = JSC.ZigString.init("A system error occurred: uv_os_setpriority returned ESRCH (no such process)"),
-                    .code = JSC.ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
+                    .message = bun.String.static("A system error occurred: uv_os_setpriority returned ESRCH (no such process)"),
+                    .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
                     //.info = info,
                     .errno = -3,
-                    .syscall = JSC.ZigString.init("uv_os_setpriority"),
+                    .syscall = bun.String.static("uv_os_setpriority"),
                 };
 
                 globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
@@ -604,11 +603,11 @@ pub const Os = struct {
             },
             .ACCES => {
                 const err = JSC.SystemError{
-                    .message = JSC.ZigString.init("A system error occurred: uv_os_setpriority returned EACCESS (permission denied)"),
-                    .code = JSC.ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
+                    .message = bun.String.static("A system error occurred: uv_os_setpriority returned EACCESS (permission denied)"),
+                    .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_SYSTEM_ERROR))),
                     //.info = info,
                     .errno = -13,
-                    .syscall = JSC.ZigString.init("uv_os_setpriority"),
+                    .syscall = bun.String.static("uv_os_setpriority"),
                 };
 
                 globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
@@ -620,29 +619,6 @@ pub const Os = struct {
         return JSC.JSValue.jsUndefined();
     }
 
-    pub fn tmpdir(globalThis: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        JSC.markBinding(@src());
-
-        const dir: []const u8 = brk: {
-            if (comptime Environment.isWindows) {
-                if (bun.getenvZ("TEMP") orelse bun.getenvZ("TMP")) |tmpdir_| {
-                    break :brk tmpdir_;
-                }
-
-                if (bun.getenvZ("SYSTEMROOT") orelse bun.getenvZ("WINDIR")) |systemdir_| {
-                    break :brk systemdir_ ++ "\\temp";
-                }
-            } else {
-                const dir = bun.asByteSlice(bun.getenvZ("TMPDIR") orelse bun.getenvZ("TMP") orelse bun.getenvZ("TEMP") orelse "/tmp");
-                break :brk strings.withoutTrailingSlash(dir);
-            }
-
-            break :brk "unknown";
-        };
-
-        return JSC.ZigString.init(dir).withEncoding().toValueGC(globalThis);
-    }
-
     pub fn totalmem(_: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue {
         JSC.markBinding(@src());
 
diff --git a/src/bun.js/node/os/constants.zig b/src/bun.js/node/os/constants.zig
index a3508d383..9cf754e03 100644
--- a/src/bun.js/node/os/constants.zig
+++ b/src/bun.js/node/os/constants.zig
@@ -8,14 +8,14 @@ const ConstantType = enum { ERRNO, ERRNO_WIN, SIG, DLOPEN, OTHER };
 
 fn getErrnoConstant(comptime name: []const u8) ?comptime_int {
     return if (@hasField(std.os.E, name))
-        return @enumToInt(@field(std.os.E, name))
+        return @intFromEnum(@field(std.os.E, name))
     else
         return null;
 }
 
 fn getWindowsErrnoConstant(comptime name: []const u8) ?comptime_int {
     return if (@hasField(std.os.E, name))
-        return @enumToInt(@field(std.os.windows.ws2_32.WinsockError, name))
+        return @intFromEnum(@field(std.os.windows.ws2_32.WinsockError, name))
     else
         return null;
 }
diff --git a/src/bun.js/node/syscall.zig b/src/bun.js/node/syscall.zig
index 7b10a3028..5ff0b2f44 100644
--- a/src/bun.js/node/syscall.zig
+++ b/src/bun.js/node/syscall.zig
@@ -106,6 +106,10 @@ pub const Tag = enum(u8) {
     waitpid,
     posix_spawn,
     getaddrinfo,
+    writev,
+    pwritev,
+    readv,
+    preadv,
     pub var strings = std.EnumMap(Tag, JSC.C.JSStringRef).initFull(null);
 };
 const PathString = @import("root").bun.PathString;
@@ -202,7 +206,7 @@ pub fn openat(dirfd: bun.FileDescriptor, file_path: [:0]const u8, flags: JSC.Nod
             .SUCCESS => .{ .result = @intCast(bun.FileDescriptor, rc) },
             else => |err| .{
                 .err = .{
-                    .errno = @truncate(Syscall.Error.Int, @enumToInt(err)),
+                    .errno = @truncate(Syscall.Error.Int, @intFromEnum(err)),
                     .syscall = .open,
                 },
             },
@@ -218,7 +222,7 @@ pub fn openat(dirfd: bun.FileDescriptor, file_path: [:0]const u8, flags: JSC.Nod
             else => |err| {
                 return Maybe(std.os.fd_t){
                     .err = .{
-                        .errno = @truncate(Syscall.Error.Int, @enumToInt(err)),
+                        .errno = @truncate(Syscall.Error.Int, @intFromEnum(err)),
                         .syscall = .open,
                     },
                 };
@@ -253,14 +257,14 @@ pub fn closeAllowingStdoutAndStderr(fd: std.os.fd_t) ?Syscall.Error {
     if (comptime Environment.isMac) {
         // This avoids the EINTR problem.
         return switch (system.getErrno(system.@"close$NOCANCEL"(fd))) {
-            .BADF => Syscall.Error{ .errno = @enumToInt(os.E.BADF), .syscall = .close },
+            .BADF => Syscall.Error{ .errno = @intFromEnum(os.E.BADF), .syscall = .close },
             else => null,
         };
     }
 
     if (comptime Environment.isLinux) {
         return switch (linux.getErrno(linux.close(fd))) {
-            .BADF => Syscall.Error{ .errno = @enumToInt(os.E.BADF), .syscall = .close },
+            .BADF => Syscall.Error{ .errno = @intFromEnum(os.E.BADF), .syscall = .close },
             else => null,
         };
     }
@@ -302,6 +306,154 @@ pub fn write(fd: os.fd_t, bytes: []const u8) Maybe(usize) {
     }
 }
 
+/// Returns the sum of `iov_len` over every entry in `buffers`.
+/// `buffers` may be any iterable whose elements expose an `iov_len` field.
+fn veclen(buffers: anytype) usize {
+    var total: usize = 0;
+    for (buffers) |entry| total += entry.iov_len;
+    return total;
+}
+
+/// Gather-write: hands every iovec in `buffers` to a single `writev_sym` call on `fd`.
+/// Returns the call's result as a `usize`, or the error tagged `.writev` with `fd`.
+pub fn writev(fd: os.fd_t, buffers: []std.os.iovec) Maybe(usize) {
+    const iov = @ptrCast([*]std.os.iovec_const, buffers.ptr);
+    if (comptime Environment.isMac) {
+        // macOS: one attempt only; any error goes straight back to the caller.
+        const rc = writev_sym(fd, iov, @intCast(i32, buffers.len));
+        if (comptime Environment.allow_assert) {
+            log("writev({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+        }
+        return Maybe(usize).errnoSysFd(rc, .writev, fd) orelse
+            Maybe(usize){ .result = @intCast(usize, rc) };
+    } else {
+        // Other targets: repeat the call for as long as it fails with EINTR.
+        while (true) {
+            const rc = writev_sym(fd, iov, buffers.len);
+            if (comptime Environment.allow_assert) {
+                log("writev({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+            }
+            const failure = Maybe(usize).errnoSysFd(rc, .writev, fd) orelse
+                return Maybe(usize){ .result = @intCast(usize, rc) };
+            if (failure.getErrno() == .INTR) continue;
+            return failure;
+        }
+        unreachable;
+    }
+}
+
+/// Positional gather-write: passes the iovecs in `buffers` and `position` to `pwritev_sym`.
+/// Returns the call's result as a `usize`, or the error tagged `.pwritev` with `fd`.
+pub fn pwritev(fd: os.fd_t, buffers: []std.os.iovec, position: isize) Maybe(usize) {
+    const iov = @ptrCast([*]std.os.iovec_const, buffers.ptr);
+    if (comptime Environment.isMac) {
+        // macOS: one attempt only; any error goes straight back to the caller.
+        const rc = pwritev_sym(fd, iov, @intCast(i32, buffers.len), position);
+        if (comptime Environment.allow_assert) {
+            log("pwritev({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+        }
+        return Maybe(usize).errnoSysFd(rc, .pwritev, fd) orelse
+            Maybe(usize){ .result = @intCast(usize, rc) };
+    } else {
+        // Other targets: repeat the call for as long as it fails with EINTR.
+        while (true) {
+            const rc = pwritev_sym(fd, iov, buffers.len, position);
+            if (comptime Environment.allow_assert) {
+                log("pwritev({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+            }
+            const failure = Maybe(usize).errnoSysFd(rc, .pwritev, fd) orelse
+                return Maybe(usize){ .result = @intCast(usize, rc) };
+            if (failure.getErrno() == .INTR) continue;
+            return failure;
+        }
+        unreachable;
+    }
+}
+
+/// Scatter-read: passes the iovecs in `buffers` to a single `readv_sym` call on `fd`.
+/// Returns the call's result as a `usize`, or the error tagged `.readv` with `fd`.
+/// `veclen(buffers)` is only evaluated for the log line when `allow_assert` is set.
+pub fn readv(fd: os.fd_t, buffers: []std.os.iovec) Maybe(usize) {
+    if (comptime Environment.isMac) {
+        // macOS: one attempt only; any error goes straight back to the caller.
+        const rc = readv_sym(fd, buffers.ptr, @intCast(i32, buffers.len));
+        if (comptime Environment.allow_assert) {
+            log("readv({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+        }
+        return Maybe(usize).errnoSysFd(rc, .readv, fd) orelse
+            Maybe(usize){ .result = @intCast(usize, rc) };
+    } else {
+        // Other targets: repeat the call for as long as it fails with EINTR.
+        while (true) {
+            const rc = readv_sym(fd, buffers.ptr, buffers.len);
+            if (comptime Environment.allow_assert) {
+                log("readv({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+            }
+            const failure = Maybe(usize).errnoSysFd(rc, .readv, fd) orelse
+                return Maybe(usize){ .result = @intCast(usize, rc) };
+            if (failure.getErrno() == .INTR) continue;
+            return failure;
+        }
+        unreachable;
+    }
+}
+
+/// Positional scatter-read: passes the iovecs in `buffers` and `position` to `preadv_sym`.
+/// Returns the call's result as a `usize`, or the error tagged `.preadv` with `fd`.
+/// `veclen(buffers)` is only evaluated for the log line when `allow_assert` is set.
+pub fn preadv(fd: os.fd_t, buffers: []std.os.iovec, position: isize) Maybe(usize) {
+    if (comptime Environment.isMac) {
+        // macOS: one attempt only; any error goes straight back to the caller.
+        const rc = preadv_sym(fd, buffers.ptr, @intCast(i32, buffers.len), position);
+        if (comptime Environment.allow_assert) {
+            log("preadv({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+        }
+        return Maybe(usize).errnoSysFd(rc, .preadv, fd) orelse
+            Maybe(usize){ .result = @intCast(usize, rc) };
+    } else {
+        // Other targets: repeat the call for as long as it fails with EINTR.
+        while (true) {
+            const rc = preadv_sym(fd, buffers.ptr, buffers.len, position);
+            if (comptime Environment.allow_assert) {
+                log("preadv({d}, {d}) = {d}", .{ fd, veclen(buffers), rc });
+            }
+            const failure = Maybe(usize).errnoSysFd(rc, .preadv, fd) orelse
+                return Maybe(usize){ .result = @intCast(usize, rc) };
+            if (failure.getErrno() == .INTR) continue;
+            return failure;
+        }
+        unreachable;
+    }
+}
+
+const preadv_sym = if (builtin.os.tag == .linux and builtin.link_libc) // compile-time choice of the function `preadv` calls
+    std.os.linux.preadv // Linux with libc linked
+else if (builtin.os.tag.isDarwin())
+    system.@"preadv$NOCANCEL" // Darwin: the `$NOCANCEL` symbol from `system`
+else
+    system.preadv; // anything else, including Linux without libc
+
+const readv_sym = if (builtin.os.tag == .linux and builtin.link_libc) // compile-time choice of the function `readv` calls
+    std.os.linux.readv // Linux with libc linked
+else if (builtin.os.tag.isDarwin())
+    system.@"readv$NOCANCEL" // Darwin: the `$NOCANCEL` symbol from `system`
+else
+    system.readv; // anything else, including Linux without libc
+
+const pwritev_sym = if (builtin.os.tag == .linux and builtin.link_libc) // compile-time choice of the function `pwritev` calls
+    std.os.linux.pwritev // Linux with libc linked
+else if (builtin.os.tag.isDarwin())
+    system.@"pwritev$NOCANCEL" // Darwin: the `$NOCANCEL` symbol from `system`
+else
+    system.pwritev; // anything else, including Linux without libc
+
+const writev_sym = if (builtin.os.tag == .linux and builtin.link_libc) // compile-time choice of the function `writev` calls
+    std.os.linux.writev // Linux with libc linked
+else if (builtin.os.tag.isDarwin())
+    system.@"writev$NOCANCEL" // Darwin: the `$NOCANCEL` symbol from `system`
+else
+    system.writev; // anything else, including Linux without libc
+
 const pread_sym = if (builtin.os.tag == .linux and builtin.link_libc)
     sys.pread64
 else if (builtin.os.tag.isDarwin())
@@ -546,7 +698,7 @@ pub fn getFdPath(fd: fd_t, out_buffer: *[MAX_PATH_BYTES]u8) Maybe([]u8) {
         .macos, .ios, .watchos, .tvos => {
             // On macOS, we can use F.GETPATH fcntl command to query the OS for
             // the path to the file descriptor.
-            @memset(out_buffer, 0, MAX_PATH_BYTES);
+            @memset(out_buffer[0..MAX_PATH_BYTES], 0);
             if (Maybe([]u8).errnoSys(system.fcntl(fd, os.F.GETPATH, out_buffer), .fcntl)) |err| {
                 return err;
             }
@@ -594,7 +746,7 @@ fn mmap(
     const fail = std.c.MAP.FAILED;
     if (rc == fail) {
         return Maybe([]align(mem.page_size) u8){
-            .err = .{ .errno = @truncate(Syscall.Error.Int, @enumToInt(std.c.getErrno(@bitCast(i64, @ptrToInt(fail))))), .syscall = .mmap },
+            .err = .{ .errno = @truncate(Syscall.Error.Int, @intFromEnum(std.c.getErrno(@bitCast(i64, @intFromPtr(fail))))), .syscall = .mmap },
         };
     }
 
@@ -643,16 +795,16 @@ pub fn munmap(memory: []align(mem.page_size) const u8) Maybe(void) {
 pub const Error = struct {
     const max_errno_value = brk: {
         const errno_values = std.enums.values(os.E);
-        var err = @enumToInt(os.E.SUCCESS);
+        var err = @intFromEnum(os.E.SUCCESS);
         for (errno_values) |errn| {
-            err = @max(err, @enumToInt(errn));
+            err = @max(err, @intFromEnum(errn));
         }
         break :brk err;
     };
     pub const Int: type = std.math.IntFittingRange(0, max_errno_value + 5);
 
     errno: Int,
-    syscall: Syscall.Tag = @intToEnum(Syscall.Tag, 0),
+    syscall: Syscall.Tag = @enumFromInt(Syscall.Tag, 0),
     path: []const u8 = "",
     fd: i32 = -1,
 
@@ -661,7 +813,7 @@ pub const Error = struct {
     }
 
     pub fn fromCode(errno: os.E, syscall: Syscall.Tag) Error {
-        return .{ .errno = @truncate(Int, @enumToInt(errno)), .syscall = syscall };
+        return .{ .errno = @truncate(Int, @intFromEnum(errno)), .syscall = syscall };
     }
 
     pub fn format(self: Error, comptime fmt: []const u8, opts: std.fmt.FormatOptions, writer: anytype) !void {
@@ -672,16 +824,16 @@ pub const Error = struct {
 
     pub const retry = Error{
         .errno = if (Environment.isLinux)
-            @intCast(Int, @enumToInt(os.E.AGAIN))
+            @intCast(Int, @intFromEnum(os.E.AGAIN))
         else if (Environment.isMac)
-            @intCast(Int, @enumToInt(os.E.WOULDBLOCK))
+            @intCast(Int, @intFromEnum(os.E.WOULDBLOCK))
         else
-            @intCast(Int, @enumToInt(os.E.INTR)),
+            @intCast(Int, @intFromEnum(os.E.INTR)),
         .syscall = .retry,
     };
 
     pub inline fn getErrno(this: Error) os.E {
-        return @intToEnum(os.E, this.errno);
+        return @enumFromInt(os.E, this.errno);
     }
 
     pub inline fn withPath(this: Error, path: anytype) Error {
@@ -721,20 +873,20 @@ pub const Error = struct {
     pub fn toSystemError(this: Error) SystemError {
         var err = SystemError{
             .errno = @as(c_int, this.errno) * -1,
-            .syscall = JSC.ZigString.init(@tagName(this.syscall)),
+            .syscall = bun.String.static(@tagName(this.syscall)),
         };
 
         // errno label
         if (this.errno > 0 and this.errno < C.SystemErrno.max) {
-            const system_errno = @intToEnum(C.SystemErrno, this.errno);
-            err.code = JSC.ZigString.init(@tagName(system_errno));
+            const system_errno = @enumFromInt(C.SystemErrno, this.errno);
+            err.code = bun.String.static(@tagName(system_errno));
             if (C.SystemErrno.labels.get(system_errno)) |label| {
-                err.message = JSC.ZigString.init(label);
+                err.message = bun.String.static(label);
             }
         }
 
         if (this.path.len > 0) {
-            err.path = JSC.ZigString.init(this.path);
+            err.path = bun.String.create(this.path);
         }
 
         if (this.fd != -1) {
diff --git a/src/bun.js/node/types.zig b/src/bun.js/node/types.zig
index 1fe378a84..642039ba5 100644
--- a/src/bun.js/node/types.zig
+++ b/src/bun.js/node/types.zig
@@ -93,6 +93,10 @@ pub fn Maybe(comptime ResultType: type) type {
                         return JSC.JSValue.jsUndefined();
                     }
 
+                    if (comptime ReturnType == JSC.JSValue) {
+                        return r;
+                    }
+
                     if (comptime ReturnType == JSC.ArrayBuffer) {
                         return r.toJS(globalThis, null);
                     }
@@ -135,7 +139,7 @@ pub fn Maybe(comptime ResultType: type) type {
         pub inline fn getErrno(this: @This()) os.E {
             return switch (this) {
                 .result => os.E.SUCCESS,
-                .err => |err| @intToEnum(os.E, err.errno),
+                .err => |err| @enumFromInt(os.E, err.errno),
             };
         }
 
@@ -144,7 +148,7 @@ pub fn Maybe(comptime ResultType: type) type {
                 .SUCCESS => null,
                 else => |err| @This(){
                     // always truncate
-                    .err = .{ .errno = @truncate(Syscall.Error.Int, @enumToInt(err)) },
+                    .err = .{ .errno = @truncate(Syscall.Error.Int, @intFromEnum(err)) },
                 },
             };
         }
@@ -154,7 +158,7 @@ pub fn Maybe(comptime ResultType: type) type {
                 .SUCCESS => null,
                 else => |err| @This(){
                     // always truncate
-                    .err = .{ .errno = @truncate(Syscall.Error.Int, @enumToInt(err)), .syscall = syscall },
+                    .err = .{ .errno = @truncate(Syscall.Error.Int, @intFromEnum(err)), .syscall = syscall },
                 },
             };
         }
@@ -165,7 +169,7 @@ pub fn Maybe(comptime ResultType: type) type {
                 else => |err| @This(){
                     // always truncate
                     .err = .{
-                        .errno = @truncate(Syscall.Error.Int, @enumToInt(err)),
+                        .errno = @truncate(Syscall.Error.Int, @intFromEnum(err)),
                         .syscall = syscall,
                         .fd = @intCast(i32, fd),
                     },
@@ -178,7 +182,7 @@ pub fn Maybe(comptime ResultType: type) type {
                 .SUCCESS => null,
                 else => |err| @This(){
                     // always truncate
-                    .err = .{ .errno = @truncate(Syscall.Error.Int, @enumToInt(err)), .syscall = syscall, .path = bun.asByteSlice(path) },
+                    .err = .{ .errno = @truncate(Syscall.Error.Int, @intFromEnum(err)), .syscall = syscall, .path = bun.asByteSlice(path) },
                 },
             };
         }
@@ -520,8 +524,8 @@ pub const Encoding = enum(u8) {
         switch (encoding) {
             .base64 => {
                 var base64: [std.base64.standard.Encoder.calcSize(size)]u8 = undefined;
-                const result = JSC.ZigString.init(std.base64.standard.Encoder.encode(&base64, input)).toValueGC(globalThis);
-                return result;
+                const len = bun.base64.encode(&base64, input);
+                return JSC.ZigString.init(base64[0..len]).toValueGC(globalThis);
             },
             .base64url => {
                 var buf: [std.base64.url_safe.Encoder.calcSize(size) + "data:;base64,".len]u8 = undefined;
@@ -537,9 +541,18 @@ pub const Encoding = enum(u8) {
                 const result = JSC.ZigString.init(out).toValueGC(globalThis);
                 return result;
             },
-            else => {
-                globalThis.throwInvalidArguments("Unexpected encoding", .{});
-                return JSC.JSValue.zero;
+            .buffer => {
+                return JSC.ArrayBuffer.createBuffer(globalThis, input);
+            },
+
+            inline else => |enc| {
+                const res = JSC.WebCore.Encoder.toString(input.ptr, size, globalThis, enc);
+                if (res.isError()) {
+                    globalThis.throwValue(res);
+                    return .zero;
+                }
+
+                return res;
             },
         }
     }
@@ -567,9 +580,18 @@ pub const Encoding = enum(u8) {
                 const result = JSC.ZigString.init(out).toValueGC(globalThis);
                 return result;
             },
-            else => {
-                globalThis.throwInvalidArguments("Unexpected encoding", .{});
-                return JSC.JSValue.zero;
+            .buffer => {
+                return JSC.ArrayBuffer.createBuffer(globalThis, input);
+            },
+            inline else => |enc| {
+                const res = JSC.WebCore.Encoder.toString(input.ptr, input.len, globalThis, enc);
+
+                if (res.isError()) {
+                    globalThis.throwValue(res);
+                    return .zero;
+                }
+
+                return res;
             },
         }
     }
@@ -632,7 +654,7 @@ pub const PathLike = union(Tag) {
             }
         }
 
-        @memcpy(buf, sliced.ptr, sliced.len);
+        @memcpy(buf[0..sliced.len], sliced);
         buf[sliced.len] = 0;
         return buf[0..sliced.len :0];
     }
@@ -809,6 +831,50 @@ pub const Valid = struct {
     }
 };
 
+/// A JS array-like of ArrayBuffer-backed values, flattened into iovecs for vectored I/O.
+/// `value` keeps the original JS value; every `buffers` entry points at one element's bytes.
+pub const VectorArrayBuffer = struct {
+    value: JSC.JSValue,
+    buffers: std.ArrayList(std.os.iovec),
+
+    /// Returns the JS value this was created from, unchanged.
+    pub fn toJS(this: VectorArrayBuffer, _: *JSC.JSGlobalObject) JSC.JSValue {
+        return this.value;
+    }
+
+    /// Collects the bytes of every element of `val` into an iovec list allocated from `allocator`.
+    /// Throws "Expected ArrayBufferView[]" and returns null when `val` is not array-like or an
+    /// element is not a cell / has no array buffer. Panics if the list cannot be allocated.
+    pub fn fromJS(globalObject: *JSC.JSGlobalObject, val: JSC.JSValue, exception: JSC.C.ExceptionRef, allocator: std.mem.Allocator) ?VectorArrayBuffer {
+        if (!val.jsType().isArrayLike()) {
+            JSC.throwInvalidArguments("Expected ArrayBufferView[]", .{}, globalObject, exception);
+            return null;
+        }
+
+        var bufferlist = std.ArrayList(std.os.iovec).init(allocator);
+        const len = val.getLength(globalObject);
+        bufferlist.ensureTotalCapacityPrecise(len) catch @panic("Failed to allocate memory for ArrayBuffer[]");
+
+        var i: usize = 0;
+        while (i < len) : (i += 1) {
+            const element = val.getIndex(globalObject, @truncate(u32, i));
+            const maybe_buffer = if (element.isCell()) element.asArrayBuffer(globalObject) else null;
+            const array_buffer = maybe_buffer orelse {
+                // Release the partially built list so this error path does not leak it.
+                bufferlist.deinit();
+                JSC.throwInvalidArguments("Expected ArrayBufferView[]", .{}, globalObject, exception);
+                return null;
+            };
+
+            // The iovec aliases the slice returned by `byteSlice()`; nothing is copied here.
+            const buf = array_buffer.byteSlice();
+            bufferlist.append(.{ .iov_base = buf.ptr, .iov_len = buf.len }) catch @panic("Failed to allocate memory for ArrayBuffer[]");
+        }
+
+        return VectorArrayBuffer{ .value = val, .buffers = bufferlist };
+    }
+};
+
 pub const ArgumentsSlice = struct {
     remaining: []const JSC.JSValue,
     vm: *JSC.VirtualMachine,
@@ -904,7 +970,7 @@ pub fn timeLikeFromJS(globalThis: *JSC.JSGlobalObject, value: JSC.JSValue, _: JS
             return null;
         }
 
-        return @truncate(TimeLike, @floatToInt(i64, milliseconds / @as(f64, std.time.ms_per_s)));
+        return @truncate(TimeLike, @intFromFloat(i64, milliseconds / @as(f64, std.time.ms_per_s)));
     }
 
     if (!value.isNumber() and !value.isString()) {
@@ -916,7 +982,7 @@ pub fn timeLikeFromJS(globalThis: *JSC.JSGlobalObject, value: JSC.JSValue, _: JS
         return null;
     }
 
-    return @truncate(TimeLike, @floatToInt(i64, seconds));
+    return @truncate(TimeLike, @intFromFloat(i64, seconds));
 }
 
 pub fn modeFromJS(ctx: JSC.C.JSContextRef, value: JSC.JSValue, exception: JSC.C.ExceptionRef) ?Mode {
@@ -968,8 +1034,8 @@ pub const PathOrFileDescriptor = union(Tag) {
 
     pub fn hash(this: JSC.Node.PathOrFileDescriptor) u64 {
         return switch (this) {
-            .path => std.hash.Wyhash.hash(0, this.path.slice()),
-            .fd => std.hash.Wyhash.hash(0, std.mem.asBytes(&this.fd)),
+            .path => bun.hash(this.path.slice()),
+            .fd => bun.hash(std.mem.asBytes(&this.fd)),
         };
     }
 
@@ -1104,7 +1170,7 @@ pub const FileSystemFlags = enum(Mode) {
     pub fn fromJS(ctx: JSC.C.JSContextRef, val: JSC.JSValue, exception: JSC.C.ExceptionRef) ?FileSystemFlags {
         if (val.isNumber()) {
             const number = val.coerce(i32, ctx);
-            return @intToEnum(FileSystemFlags, @intCast(Mode, @max(number, 0)));
+            return @enumFromInt(FileSystemFlags, @intCast(Mode, @max(number, 0)));
         }
 
         const jsType = val.jsType();
@@ -1160,7 +1226,7 @@ pub const FileSystemFlags = enum(Mode) {
                 return null;
             };
 
-            return @intToEnum(FileSystemFlags, @intCast(Mode, flags));
+            return @enumFromInt(FileSystemFlags, @intCast(Mode, flags));
         }
 
         return null;
@@ -1172,7 +1238,7 @@ pub const Date = enum(u64) {
     _,
 
     pub fn toJS(this: Date, ctx: JSC.C.JSContextRef, exception: JSC.C.ExceptionRef) JSC.C.JSValueRef {
-        const seconds = @floatCast(f64, @intToFloat(f64, @enumToInt(this)) * 1000.0);
+        const seconds = @floatCast(f64, @floatFromInt(f64, @intFromEnum(this)) * 1000.0);
         const unix_timestamp = JSC.JSValue.jsNumber(seconds);
         const array: [1]JSC.C.JSValueRef = .{unix_timestamp.asObjectRef()};
         const obj = JSC.C.JSObjectMakeDate(ctx, 1, &array, exception);
@@ -1219,12 +1285,12 @@ fn StatsDataType(comptime T: type) type {
                 .size = @truncate(T, @intCast(i64, stat_.size)),
                 .blksize = @truncate(T, @intCast(i64, stat_.blksize)),
                 .blocks = @truncate(T, @intCast(i64, stat_.blocks)),
-                .atime_ms = (@intToFloat(f64, @max(atime.tv_sec, 0)) * std.time.ms_per_s) + (@intToFloat(f64, @intCast(usize, @max(atime.tv_nsec, 0))) / std.time.ns_per_ms),
-                .mtime_ms = (@intToFloat(f64, @max(mtime.tv_sec, 0)) * std.time.ms_per_s) + (@intToFloat(f64, @intCast(usize, @max(mtime.tv_nsec, 0))) / std.time.ns_per_ms),
-                .ctime_ms = (@intToFloat(f64, @max(ctime.tv_sec, 0)) * std.time.ms_per_s) + (@intToFloat(f64, @intCast(usize, @max(ctime.tv_nsec, 0))) / std.time.ns_per_ms),
-                .atime = @intToEnum(Date, @intCast(u64, @max(atime.tv_sec, 0))),
-                .mtime = @intToEnum(Date, @intCast(u64, @max(mtime.tv_sec, 0))),
-                .ctime = @intToEnum(Date, @intCast(u64, @max(ctime.tv_sec, 0))),
+                .atime_ms = (@floatFromInt(f64, @max(atime.tv_sec, 0)) * std.time.ms_per_s) + (@floatFromInt(f64, @intCast(usize, @max(atime.tv_nsec, 0))) / std.time.ns_per_ms),
+                .mtime_ms = (@floatFromInt(f64, @max(mtime.tv_sec, 0)) * std.time.ms_per_s) + (@floatFromInt(f64, @intCast(usize, @max(mtime.tv_nsec, 0))) / std.time.ns_per_ms),
+                .ctime_ms = (@floatFromInt(f64, @max(ctime.tv_sec, 0)) * std.time.ms_per_s) + (@floatFromInt(f64, @intCast(usize, @max(ctime.tv_nsec, 0))) / std.time.ns_per_ms),
+                .atime = @enumFromInt(Date, @intCast(u64, @max(atime.tv_sec, 0))),
+                .mtime = @enumFromInt(Date, @intCast(u64, @max(mtime.tv_sec, 0))),
+                .ctime = @enumFromInt(Date, @intCast(u64, @max(ctime.tv_sec, 0))),
 
                 // Linux doesn't include this info in stat
                 // maybe it does in statx, but do you really need birthtime? If you do please file an issue.
@@ -1234,9 +1300,9 @@ fn StatsDataType(comptime T: type) type {
                     @truncate(T, @intCast(i64, if (stat_.birthtime().tv_nsec > 0) (@intCast(usize, stat_.birthtime().tv_nsec) / std.time.ns_per_ms) else 0)),
 
                 .birthtime = if (Environment.isLinux)
-                    @intToEnum(Date, 0)
+                    @enumFromInt(Date, 0)
                 else
-                    @intToEnum(Date, @intCast(u64, @max(stat_.birthtime().tv_sec, 0))),
+                    @enumFromInt(Date, @intCast(u64, @max(stat_.birthtime().tv_sec, 0))),
             };
         }
     };
@@ -1426,49 +1492,49 @@ pub const Dirent = struct {
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.BlockDevice);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.block_device);
     }
     pub fn isCharacterDevice(
         this: *Dirent,
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.CharacterDevice);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.character_device);
     }
     pub fn isDirectory(
         this: *Dirent,
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.Directory);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.directory);
     }
     pub fn isFIFO(
         this: *Dirent,
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.NamedPipe or this.kind == std.fs.File.Kind.EventPort);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.named_pipe or this.kind == std.fs.File.Kind.event_port);
     }
     pub fn isFile(
         this: *Dirent,
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.File);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.file);
     }
     pub fn isSocket(
         this: *Dirent,
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.UnixDomainSocket);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.unix_domain_socket);
     }
     pub fn isSymbolicLink(
         this: *Dirent,
         _: *JSC.JSGlobalObject,
         _: *JSC.CallFrame,
     ) callconv(.C) JSC.JSValue {
-        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.SymLink);
+        return JSC.JSValue.jsBoolean(this.kind == std.fs.File.Kind.sym_link);
     }
 
     pub fn finalize(this: *Dirent) callconv(.C) void {
@@ -1490,14 +1556,14 @@ pub const Emitter = struct {
             pub fn append(this: *List, allocator: std.mem.Allocator, ctx: JSC.C.JSContextRef, listener: Listener) !void {
                 JSC.C.JSValueProtect(ctx, listener.callback.asObjectRef());
                 try this.list.append(allocator, listener);
-                this.once_count +|= @as(u32, @boolToInt(listener.once));
+                this.once_count +|= @as(u32, @intFromBool(listener.once));
             }
 
             pub fn prepend(this: *List, allocator: std.mem.Allocator, ctx: JSC.C.JSContextRef, listener: Listener) !void {
                 JSC.C.JSValueProtect(ctx, listener.callback.asObjectRef());
                 try this.list.ensureUnusedCapacity(allocator, 1);
                 this.list.insertAssumeCapacity(0, listener);
-                this.once_count +|= @as(u32, @boolToInt(listener.once));
+                this.once_count +|= @as(u32, @intFromBool(listener.once));
             }
 
             // removeListener() will remove, at most, one instance of a listener from the
@@ -1510,7 +1576,7 @@ pub const Emitter = struct {
                 for (callbacks, 0..) |item, i| {
                     if (callback.eqlValue(item)) {
                         JSC.C.JSValueUnprotect(ctx, callback.asObjectRef());
-                        this.once_count -|= @as(u32, @boolToInt(this.list.items(.once)[i]));
+                        this.once_count -|= @as(u32, @intFromBool(this.list.items(.once)[i]));
                         this.list.orderedRemove(i);
                         return true;
                     }
@@ -2154,7 +2220,9 @@ pub const Process = struct {
         }
     }
 
-    pub fn exit(_: *JSC.JSGlobalObject, code: i32) callconv(.C) void {
+    pub fn exit(globalObject: *JSC.JSGlobalObject, code: i32) callconv(.C) void {
+        globalObject.bunVM().onExit();
+
         std.os.exit(@truncate(u8, @intCast(u32, @max(code, 0))));
     }
 
diff --git a/src/bun.js/rare_data.zig b/src/bun.js/rare_data.zig
index ddda96bf4..3b29896a4 100644
--- a/src/bun.js/rare_data.zig
+++ b/src/bun.js/rare_data.zig
@@ -9,6 +9,7 @@ const std = @import("std");
 const BoringSSL = @import("root").bun.BoringSSL;
 const bun = @import("root").bun;
 const WebSocketClientMask = @import("../http/websocket_http_client.zig").Mask;
+const UUID = @import("./uuid.zig");
 
 boring_ssl_engine: ?*BoringSSL.ENGINE = null,
 editor_context: EditorContext = EditorContext{},
@@ -47,13 +48,16 @@ pub fn filePolls(this: *RareData, vm: *JSC.VirtualMachine) *JSC.FilePoll.HiveArr
     };
 }
 
-pub fn nextUUID(this: *RareData) [16]u8 {
+pub fn nextUUID(this: *RareData) UUID {
     if (this.entropy_cache == null) {
         this.entropy_cache = default_allocator.create(EntropyCache) catch unreachable;
         this.entropy_cache.?.init();
     }
 
-    return this.entropy_cache.?.get();
+    this.entropy_cache.?.fill();
+
+    const bytes = this.entropy_cache.?.get();
+    return UUID.initWith(&bytes);
 }
 
 pub fn entropySlice(this: *RareData, len: usize) []u8 {
diff --git a/src/bun.js/test/expect.zig b/src/bun.js/test/expect.zig
new file mode 100644
index 000000000..e0833f8ed
--- /dev/null
+++ b/src/bun.js/test/expect.zig
@@ -0,0 +1,3328 @@
+const std = @import("std");
+const bun = @import("root").bun;
+const default_allocator = bun.default_allocator;
+const string = bun.string;
+const MutableString = bun.MutableString;
+const strings = bun.strings;
+const Output = bun.Output;
+const jest = bun.JSC.Jest;
+const Jest = jest.Jest;
+const TestRunner = jest.TestRunner;
+const DescribeScope = jest.DescribeScope;
+const JSC = bun.JSC;
+const VirtualMachine = JSC.VirtualMachine;
+const JSGlobalObject = JSC.JSGlobalObject;
+const JSValue = JSC.JSValue;
+const JSInternalPromise = JSC.JSInternalPromise;
+const JSPromise = JSC.JSPromise;
+const JSType = JSValue.JSType;
+const JSError = JSC.JSError;
+const JSObject = JSC.JSObject;
+const CallFrame = JSC.CallFrame;
+const ZigString = JSC.ZigString;
+const Environment = bun.Environment;
+const DiffFormatter = @import("./diff_format.zig").DiffFormatter;
+
+pub const Counter = struct {
+    expected: u32 = 0, // target count; not written in this part of the file - TODO confirm where it is set
+    actual: u32 = 0, // bumped by matchers via `active_test_expectation_counter.actual += 1`
+};
+/// Process-wide tally of matcher calls; the matchers below increment its `.actual` field.
+pub var active_test_expectation_counter: Counter = .{};
+
+/// https://jestjs.io/docs/expect
+// To support async tests, we need to track the test ID
+pub const Expect = struct {
+    pub usingnamespace JSC.Codegen.JSExpect;
+
+    test_id: TestRunner.Test.ID,
+    scope: *DescribeScope,
+    flags: Flags = .{},
+
+    pub const Flags = packed struct { // modifier state collected from getNot / getResolves / getRejects
+        promise: enum(u2) { // which promise modifier, if any, is in effect
+            resolves,
+            rejects,
+            none,
+        } = .none,
+        not: bool = false, // flipped by getNot(); inverts a matcher's pass/fail outcome
+    };
+
+    pub fn getSignature(comptime matcher_name: string, comptime args: string, comptime not: bool) string {
+        // Pretty-format markup for `expect(received).[not.]matcher(args)`, assembled from comptime pieces only.
+        const negation = if (not) "not<d>.<r>" else "";
+        const head = "<d>expect(<r><red>received<r><d>).<r>" ++ negation;
+        const tail = "<d>(<r>" ++ args ++ "<d>)<r>";
+        return head ++ matcher_name ++ tail;
+    }
+
+    pub fn getFullSignature(comptime matcher: string, comptime args: string, comptime flags: Flags) string {
+        // Same markup as getSignature(), with a `.resolves` / `.rejects` modifier spelled out ahead of `.not`.
+        const modifiers = switch (flags.promise) {
+            .resolves => if (flags.not) "resolves<d>.<r>not<d>.<r>" else "resolves<d>.<r>",
+            .rejects => if (flags.not) "rejects<d>.<r>not<d>.<r>" else "rejects<d>.<r>",
+            .none => if (flags.not) "not<d>.<r>" else "",
+        };
+        const call_markup = "<d>(<r>" ++ args ++ "<d>)<r>";
+        return "<d>expect(<r><red>received<r><d>).<r>" ++ modifiers ++ matcher ++ call_markup;
+    }
+
+    pub fn getNot(this: *Expect, thisValue: JSValue, _: *JSGlobalObject) callconv(.C) JSValue { // presumably the `.not` getter - confirm in the generated bindings
+        this.flags.not = !this.flags.not; // a toggle: applying it twice restores the original sense
+        return thisValue; // hand the same object back so the chain can continue
+    }
+
+    pub fn getResolves(this: *Expect, thisValue: JSValue, globalThis: *JSGlobalObject) callconv(.C) JSValue {
+        // `.resolves` cannot follow `.rejects`; in that case throw and leave the flags untouched.
+        if (this.flags.promise == .rejects) {
+            globalThis.throw("Cannot chain .resolves() after .rejects()", .{});
+            return .zero;
+        }
+
+        // Coming from `.none` or an earlier `.resolves`: (re)select the resolves modifier.
+        this.flags.promise = .resolves;
+        return thisValue;
+    }
+
+    pub fn getRejects(this: *Expect, thisValue: JSValue, globalThis: *JSGlobalObject) callconv(.C) JSValue {
+        // `.rejects` cannot follow `.resolves`; in that case throw and leave the flags untouched.
+        if (this.flags.promise == .resolves) {
+            globalThis.throw("Cannot chain .rejects() after .resolves()", .{});
+            return .zero;
+        }
+
+        // Coming from `.none` or an earlier `.rejects`: (re)select the rejects modifier.
+        this.flags.promise = .rejects;
+        return thisValue;
+    }
+
+    pub fn getValue(this: *Expect, globalThis: *JSGlobalObject, thisValue: JSValue, comptime matcher_name: string, comptime matcher_args: string) ?JSValue { // null: an error was thrown or the test timed out
+        if (this.scope.tests.items.len <= this.test_id) {
+            globalThis.throw("{s}() must be called in a test", .{matcher_name});
+            return null;
+        }
+        // The received value is read from a slot cached on the JS wrapper object, not from this struct.
+        const value = Expect.capturedValueGetCached(thisValue) orelse {
+            globalThis.throw("Internal error: the expect(value) was garbage collected but it should not have been!", .{});
+            return null;
+        };
+        value.ensureStillAlive();
+        // With .resolves/.rejects the received value must be a promise: wait for it and return its settled result.
+        switch (this.flags.promise) {
+            inline .resolves, .rejects => |resolution| {
+                if (value.asAnyPromise()) |promise| {
+                    var vm = globalThis.vm();
+                    promise.setHandled(vm);
+                    // Wait at most for what is left of the running test's timeout (elapsed time is in milliseconds).
+                    const now = std.time.Instant.now() catch unreachable;
+                    const pending_test = Jest.runner.?.pending_test.?;
+                    const elapsed = @divFloor(now.since(pending_test.started_at), std.time.ns_per_ms);
+                    const remaining = @truncate(u32, Jest.runner.?.last_test_timeout_timer_duration -| elapsed);
+                    // A false return from the wait is handled as a timeout of the pending test.
+                    if (!globalThis.bunVM().waitForPromiseWithTimeout(promise, remaining)) {
+                        pending_test.timeout();
+                        return null;
+                    }
+                    // Settled: reject the outcomes that contradict the modifier (resolves + rejected, rejects + fulfilled).
+                    const newValue = promise.result(vm);
+                    switch (promise.status(vm)) {
+                        .Fulfilled => switch (comptime resolution) {
+                            .resolves => {},
+                            .rejects => {
+                                if (this.flags.not) {
+                                    const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = true, .promise = .rejects });
+                                    const fmt = signature ++ "\n\nExpected promise that rejects<r>\nReceived promise that resolved: <red>{any}<r>\n";
+                                    var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                                    globalThis.throwPretty(fmt, .{newValue.toFmt(globalThis, &formatter)});
+                                    return null;
+                                }
+                                const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = false, .promise = .rejects });
+                                const fmt = signature ++ "\n\nExpected promise that rejects<r>\nReceived promise that resolved: <red>{any}<r>\n";
+                                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                                globalThis.throwPretty(fmt, .{newValue.toFmt(globalThis, &formatter)});
+                                return null;
+                            },
+                            .none => unreachable,
+                        },
+                        .Rejected => switch (comptime resolution) {
+                            .rejects => {},
+                            .resolves => {
+                                if (this.flags.not) {
+                                    const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = true, .promise = .resolves });
+                                    const fmt = signature ++ "\n\nExpected promise that resolves<r>\nReceived promise that rejected: <red>{any}<r>\n";
+                                    var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                                    globalThis.throwPretty(fmt, .{newValue.toFmt(globalThis, &formatter)});
+                                    return null;
+                                }
+                                const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = false, .promise = .resolves });
+                                const fmt = signature ++ "\n\nExpected promise that resolves<r>\nReceived promise that rejected: <red>{any}<r>\n";
+                                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                                globalThis.throwPretty(fmt, .{newValue.toFmt(globalThis, &formatter)});
+                                return null;
+                            },
+                            .none => unreachable,
+                        },
+                        .Pending => unreachable,
+                    }
+                    // Settled the expected way: the matcher runs against the promise's result.
+                    newValue.ensureStillAlive();
+                    return newValue;
+                } else {
+                    switch (this.flags.promise) {
+                        .resolves => {
+                            if (this.flags.not) {
+                                const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = true, .promise = .resolves });
+                                const fmt = signature ++ "\n\nExpected promise<r>\nReceived: <red>{any}<r>\n";
+                                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                                globalThis.throwPretty(fmt, .{value.toFmt(globalThis, &formatter)});
+                                return null;
+                            }
+                            const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = false, .promise = .resolves });
+                            const fmt = signature ++ "\n\nExpected promise<r>\nReceived: <red>{any}<r>\n";
+                            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                            globalThis.throwPretty(fmt, .{value.toFmt(globalThis, &formatter)});
+                            return null;
+                        },
+                        .rejects => {
+                            if (this.flags.not) {
+                                const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = true, .promise = .rejects });
+                                const fmt = signature ++ "\n\nExpected promise<r>\nReceived: <red>{any}<r>\n";
+                                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                                globalThis.throwPretty(fmt, .{value.toFmt(globalThis, &formatter)});
+                                return null;
+                            }
+                            const signature = comptime getFullSignature(matcher_name, matcher_args, .{ .not = false, .promise = .rejects });
+                            const fmt = signature ++ "\n\nExpected promise<r>\nReceived: <red>{any}<r>\n";
+                            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+                            globalThis.throwPretty(fmt, .{value.toFmt(globalThis, &formatter)});
+                            return null;
+                        },
+                        .none => unreachable,
+                    }
+                }
+            },
+            else => {},
+        }
+        // No promise modifier: the matcher runs against the received value itself.
+        return value;
+    }
+
+    pub fn getSnapshotName(this: *Expect, allocator: std.mem.Allocator, hint: string) ![]const u8 {
+        // Builds "<outer describe> <inner describe> <test label>[: <hint>]" in a buffer
+        // obtained from `allocator` and returned to the caller. Describe scopes with an
+        // empty label contribute nothing. Fails only if the allocation fails.
+        const test_label = this.scope.tests.items[this.test_id].label;
+
+        // Pass 1: measure. Every labelled scope costs its label plus one separating space.
+        var total: usize = test_label.len;
+        if (hint.len > 0) total += hint.len + 2;
+        var walker: ?*DescribeScope = this.scope;
+        while (walker) |scope| : (walker = scope.parent) {
+            if (scope.label.len > 0) total += scope.label.len + 1;
+        }
+
+        // Exactly `total` bytes are requested; pass 2 writes every one of them.
+        var out = try allocator.alloc(u8, total);
+
+        // Pass 2: fill back to front, because the scope chain is walked innermost-first
+        // while the finished name reads outermost-first.
+        var cursor = out.len;
+        if (hint.len > 0) {
+            cursor -= hint.len;
+            bun.copy(u8, out[cursor..], hint);
+            cursor -= 2;
+            bun.copy(u8, out[cursor..], ": ");
+        }
+        // The test label sits directly in front of the optional ": <hint>" suffix.
+        cursor -= test_label.len;
+        bun.copy(u8, out[cursor..], test_label);
+
+        // copy describe scopes in reverse order, each followed by a single space
+        walker = this.scope;
+        while (walker) |scope| : (walker = scope.parent) {
+            if (scope.label.len == 0) continue;
+            cursor -= scope.label.len + 1;
+            bun.copy(u8, out[cursor..], scope.label);
+            out[cursor + scope.label.len] = ' ';
+        }
+
+        // `cursor` is back at 0 here, so the buffer is completely filled.
+        return out;
+    }
+
+    pub fn finalize(
+        this: *Expect,
+    ) callconv(.C) void { // NOTE(review): presumably invoked by the generated JS bindings when the wrapper is collected - confirm
+        VirtualMachine.get().allocator.destroy(this); // frees the struct; call() creates it with the VM's allocator
+    }
+
+    pub fn call(globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSC.JSValue { // implements `expect(value)`
+        const arguments = callframe.arguments(1);
+        const value = if (arguments.len < 1) JSC.JSValue.jsUndefined() else arguments.ptr[0];
+        // Look for a running test BEFORE allocating: the Expect used to be created first,
+        // so this early return leaked it. Without a pending test a TestNotRunningError is thrown.
+        const pending_test = Jest.runner.?.pending_test orelse {
+            const err = globalObject.createErrorInstance("expect() must be called in a test", .{});
+            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
+            globalObject.throwValue(err);
+            return .zero;
+        };
+        // Allocation failure is treated as unreachable here; finalize() destroys the struct again.
+        var expect = globalObject.bunVM().allocator.create(Expect) catch unreachable;
+        expect.* = .{
+            .scope = pending_test.describe,
+            .test_id = pending_test.test_id,
+        };
+        const expect_js_value = expect.toJS(globalObject);
+        expect_js_value.ensureStillAlive();
+        Expect.capturedValueSetCached(expect_js_value, globalObject, value); // received value is kept in a cached slot on the wrapper
+        expect_js_value.ensureStillAlive();
+        expect.postMatch(globalObject);
+        return expect_js_value;
+    }
+
+    pub fn constructor(
+        globalObject: *JSC.JSGlobalObject,
+        _: *JSC.CallFrame,
+    ) callconv(.C) ?*Expect { // NOTE(review): presumably the `new expect()` entry point of the generated class - confirm
+        globalObject.throw("expect() cannot be called with new", .{}); // construction is always rejected
+        return null; // no Expect is ever produced by this path
+    }
+
+    /// Object.is(): passes when received and expected are the same value (no coercion); `.not` inverts the outcome.
+    pub fn toBe(
+        this: *Expect,
+        globalObject: *JSC.JSGlobalObject,
+        callframe: *JSC.CallFrame,
+    ) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+        const thisValue = callframe.this();
+        const arguments_ = callframe.arguments(1);
+        const arguments = arguments_.ptr[0..arguments_.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toBe() takes 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+        const right = arguments[0];
+        right.ensureStillAlive();
+        const left = this.getValue(globalObject, thisValue, "toBe", "<green>expected<r>") orelse return .zero;
+        const not = this.flags.not;
+        var pass = right.isSameValue(left, globalObject);
+        if (comptime Environment.allow_assert) {
+            // Object.is() and `===` disagree on NaN and on +0 vs -0, so cross-check only when not comparing two numbers.
+            const both_numbers = left.isNumber() and right.isNumber();
+            if (!both_numbers) std.debug.assert(pass == JSC.C.JSValueIsStrictEqual(globalObject, right.asObjectRef(), left.asObjectRef()));
+        }
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        if (not) {
+            const signature = comptime getSignature("toBe", "<green>expected<r>", true);
+            const fmt = signature ++ "\n\nExpected: not <green>{any}<r>\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{right.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{right.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+        // Not negated from here on. Deeply equal values failed on identity only: suggest toEqual / toStrictEqual.
+        const signature = comptime getSignature("toBe", "<green>expected<r>", false);
+        if (left.deepEquals(right, globalObject) or left.strictDeepEquals(right, globalObject)) {
+            const fmt = signature ++
+                "\n\n<d>If this test should pass, replace \"toBe\" with \"toEqual\" or \"toStrictEqual\"<r>" ++
+                "\n\nExpected: <green>{any}<r>\n" ++
+                "Received: serializes to the same string\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{right.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{right.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+        // Two strings: report through DiffFormatter instead of printing both values in full.
+        if (right.isString() and left.isString()) {
+            const diff_format = DiffFormatter{
+                .expected = right,
+                .received = left,
+                .globalObject = globalObject,
+                .not = not,
+            };
+            const fmt = signature ++ "\n\n{any}\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{diff_format});
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{diff_format});
+            return .zero;
+        }
+        // Everything else: print the expected and the received value.
+        const fmt = signature ++ "\n\nExpected: <green>{any}<r>\nReceived: <red>{any}<r>\n";
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{
+                right.toFmt(globalObject, &formatter),
+                left.toFmt(globalObject, &formatter),
+            });
+            return .zero;
+        }
+        globalObject.throw(Output.prettyFmt(fmt, false), .{
+            right.toFmt(globalObject, &formatter),
+            left.toFmt(globalObject, &formatter),
+        });
+        return .zero;
+    }
+
+    pub fn toHaveLength(
+        this: *Expect,
+        globalObject: *JSC.JSGlobalObject,
+        callframe: *JSC.CallFrame,
+    ) callconv(.C) JSC.JSValue {
+        // Matcher: passes when the length of the received value equals the expected
+        // non-negative integer; `.not` inverts the outcome. Every failure path throws
+        // on globalObject and returns .zero.
+        defer this.postMatch(globalObject);
+        const thisValue = callframe.this();
+        const args_buf = callframe.arguments(1);
+        const args = args_buf.ptr[0..args_buf.len];
+
+        if (args.len < 1) {
+            globalObject.throwInvalidArguments("toHaveLength() takes 1 argument", .{});
+            return .zero;
+        }
+
+        // The call is counted as soon as its argument count is known to be valid.
+        active_test_expectation_counter.actual += 1;
+
+        const expected: JSValue = args[0];
+        const received: JSValue = this.getValue(globalObject, thisValue, "toHaveLength", "<green>expected<r>") orelse return .zero;
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        // Only objects and strings are accepted as the received value.
+        if (!(received.isObject() or received.isString())) {
+            globalObject.throw("Received value does not have a length property: {any}", .{received.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        // The expected length has to be a number ...
+        if (!expected.isNumber()) {
+            globalObject.throw("Expected value must be a non-negative integer: {any}", .{expected.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        // ... and, more precisely, a finite non-negative whole number.
+        const wanted: f64 = expected.asNumber();
+        const is_whole = @round(wanted) == wanted;
+        if (!is_whole or std.math.isInf(wanted) or std.math.isNan(wanted) or wanted < 0) {
+            globalObject.throw("Expected value must be a non-negative integer: {any}", .{expected.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        const negated = this.flags.not;
+        const found = received.getLengthIfPropertyExistsInternal(globalObject);
+
+        // +inf is treated as "no length property", NaN as a length that is not a number.
+        if (found == std.math.inf(f64)) {
+            globalObject.throw("Received value does not have a length property: {any}", .{received.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+        if (std.math.isNan(found)) {
+            globalObject.throw("Received value has non-number length property: {}", .{found});
+            return .zero;
+        }
+
+        // Pass condition with `.not` folded in.
+        if ((found == wanted) != negated) return thisValue;
+
+        // handle failure
+        if (negated) {
+            const fmt = comptime getSignature("toHaveLength", "<green>expected<r>", true) ++ "\n\n" ++ "Expected length: not <green>{d}<r>\n";
+            // prettyFmt() is told whether ANSI colors are currently enabled.
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{wanted});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{wanted});
+            }
+            return .zero;
+        }
+
+        // Not negated: report the expected and the received length.
+        const expected_line = "Expected length: <green>{d}<r>\n";
+        const received_line = "Received length: <red>{d}<r>\n";
+        const fmt = comptime getSignature("toHaveLength", "<green>expected<r>", false) ++ "\n\n" ++
+            expected_line ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{ wanted, found });
+        } else {
+            globalObject.throw(Output.prettyFmt(fmt, false), .{ wanted, found });
+        }
+        return .zero;
+    }
+
+    pub fn toContain( // matcher: received iterable has an element, or received string a substring, matching `expected`
+        this: *Expect,
+        globalObject: *JSC.JSGlobalObject,
+        callFrame: *JSC.CallFrame,
+    ) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+        const thisValue = callFrame.this();
+        const arguments_ = callFrame.arguments(1);
+        const arguments = arguments_.ptr[0..arguments_.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toContain() takes 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+        const expected = arguments[0];
+        expected.ensureStillAlive();
+        const value: JSValue = this.getValue(globalObject, thisValue, "toContain", "<green>expected<r>") orelse return .zero;
+
+        const not = this.flags.not;
+        var pass = false;
+        // Iterables are searched element by element with isSameValue; two strings use a substring test.
+        if (value.isIterable(globalObject)) {
+            var itr = value.arrayIterator(globalObject);
+            while (itr.next()) |item| {
+                if (item.isSameValue(expected, globalObject)) {
+                    pass = true;
+                    break;
+                }
+            }
+        } else if (value.isString() and expected.isString()) {
+            const value_slice = value.toString(globalObject).toSlice(globalObject, default_allocator);
+            defer value_slice.deinit(); // toSlice() may own an allocated copy; it used to be leaked
+            const expected_slice = expected.toString(globalObject).toSlice(globalObject, default_allocator);
+            defer expected_slice.deinit(); // released when this branch is left, after `pass` is computed
+            const value_string = value_slice.slice();
+            const expected_string = expected_slice.slice();
+            // edge case two empty strings are true
+            pass = strings.contains(value_string, expected_string) or
+                (value_string.len == 0 and expected_string.len == 0);
+        } else {
+            globalObject.throw("Received value must be an array type, or both received and expected values must be strings.", .{});
+            return .zero;
+        }
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const value_fmt = value.toFmt(globalObject, &formatter);
+        const expected_fmt = expected.toFmt(globalObject, &formatter);
+        if (not) {
+            const expected_line = "Expected to contain: not <green>{any}<r>\n";
+            const fmt = comptime getSignature("toContain", "<green>expected<r>", true) ++ "\n\n" ++ expected_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_fmt});
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_fmt});
+            return .zero;
+        }
+
+        const expected_line = "Expected to contain: <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toContain", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    pub fn toBeTruthy(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        // Matcher: passes when the received value converts to `true`
+        // (JSValue.toBooleanSlow); `.not` inverts the outcome.
+        defer this.postMatch(globalObject);
+        const thisValue = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, thisValue, "toBeTruthy", "") orelse return .zero;
+
+        // Counted only after the received value was obtained successfully.
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        const is_truthy = received.toBooleanSlow(globalObject);
+
+        // `is_truthy != negated` is the pass condition with `.not` folded in.
+        if (is_truthy != negated) return thisValue;
+
+        // handle failure: both variants print the received value and differ
+        // only in the signature line.
+        const received_line = "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        if (negated) {
+            const fmt = comptime getSignature("toBeTruthy", "", true) ++ "\n\n" ++ received_line;
+            // prettyFmt() is told whether ANSI colors are currently enabled.
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+            return .zero;
+        }
+
+        // Not negated: same report under the plain signature.
+        const fmt = comptime getSignature("toBeTruthy", "", false) ++ "\n\n" ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+        } else {
+            globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+        }
+        return .zero;
+    }
+
+    pub fn toBeUndefined(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        // Matcher: passes when the received value is `undefined`; `.not` inverts the outcome.
+        defer this.postMatch(globalObject);
+        const thisValue = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, thisValue, "toBeUndefined", "") orelse return .zero;
+
+        // Counted only after the received value was obtained successfully.
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        const is_undefined = received.isUndefined();
+
+        // `is_undefined != negated` is the pass condition with `.not` folded in.
+        if (is_undefined != negated) return thisValue;
+
+        // handle failure: both variants print the received value and differ
+        // only in the signature line.
+        const received_line = "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        if (negated) {
+            const fmt = comptime getSignature("toBeUndefined", "", true) ++ "\n\n" ++ received_line;
+            // prettyFmt() is told whether ANSI colors are currently enabled.
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeUndefined", "", false) ++ "\n\n" ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+        } else {
+            globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+        }
+        return .zero;
+    }
+
+    pub fn toBeNaN(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        // Matcher: passes when the received value is a number and that number is NaN;
+        // `.not` inverts the outcome.
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, thisValue, "toBeNaN", "") orelse return .zero;
+
+        // Counted only after the received value was obtained successfully.
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        // Values that are not numbers never count as NaN here; no conversion is attempted.
+        const is_nan = received.isNumber() and std.math.isNan(received.asNumber());
+
+        // `is_nan != negated` is the pass condition with `.not` folded in.
+        if (is_nan != negated) return thisValue;
+
+        // handle failure: both variants print the received value and differ
+        // only in the signature line.
+        const received_line = "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        if (negated) {
+            const fmt = comptime getSignature("toBeNaN", "", true) ++ "\n\n" ++ received_line;
+            // prettyFmt() is told whether ANSI colors are currently enabled.
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+            return .zero;
+        }
+
+        // Not negated: same report under the plain signature.
+        const fmt = comptime getSignature("toBeNaN", "", false) ++ "\n\n" ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+        } else {
+            globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+        }
+        return .zero;
+    }
+
+    pub fn toBeNull(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        // Matcher: passes when the received value is `null`; `.not` inverts the outcome.
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, thisValue, "toBeNull", "") orelse return .zero;
+
+        // Counted only after the received value was obtained successfully.
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        const is_null = received.isNull();
+
+        // `is_null != negated` is the pass condition with `.not` folded in.
+        if (is_null != negated) return thisValue;
+
+        // handle failure: the two variants differ only in the signature line.
+        const received_line = "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        if (negated) {
+            const fmt = comptime getSignature("toBeNull", "", true) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeNull", "", false) ++ "\n\n" ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+        } else {
+            globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+        }
+        return .zero;
+    }
+
+    pub fn toBeDefined(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        // Matcher: passes when the received value is anything but `undefined`; `.not` inverts the outcome.
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, thisValue, "toBeDefined", "") orelse return .zero;
+
+        // Counted only after the received value was obtained successfully.
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        const is_defined = !received.isUndefined();
+
+        // `is_defined != negated` is the pass condition with `.not` folded in.
+        if (is_defined != negated) return thisValue;
+
+        // handle failure: the two variants differ only in the signature line.
+        const received_line = "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        if (negated) {
+            const fmt = comptime getSignature("toBeDefined", "", true) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeDefined", "", false) ++ "\n\n" ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+        } else {
+            globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeFalsy()`.
+    ///
+    /// Succeeds when `toBooleanSlow` reports the received value as `false`; the
+    /// outcome is inverted when the `.not` flag is set. On success the call's
+    /// `this` value is returned. On failure an error is thrown on `globalObject`
+    /// and `.zero` is returned. `postMatch` runs on every exit path.
+    pub fn toBeFalsy(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, this_value, "toBeFalsy", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        const truthy = received.toBooleanSlow(globalObject);
+        // "falsy XOR negated" is the same as "truthy == negated".
+        if (truthy == negated) return this_value;
+
+        // Failure: print the received value underneath the matcher signature.
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        const received_line = "Received: <red>{any}<r>\n";
+
+        if (negated) {
+            const fmt = comptime getSignature("toBeFalsy", "", true) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+        } else {
+            const fmt = comptime getSignature("toBeFalsy", "", false) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+        }
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toEqual(expected)`.
+    ///
+    /// Requires one argument; otherwise an invalid-arguments error is thrown.
+    /// The received value is compared to `expected` with `jestDeepEquals`, and
+    /// the outcome is inverted when the `.not` flag is set. On success the
+    /// call's `this` value is returned. On failure a `DiffFormatter` rendering
+    /// of both values is thrown via `throwPretty` and `.zero` is returned.
+    pub fn toEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const call_args = callFrame.arguments(1);
+
+        if (call_args.len < 1) {
+            globalObject.throwInvalidArguments("toEqual() requires 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const expected = call_args.ptr[0];
+        const received: JSValue = this.getValue(globalObject, this_value, "toEqual", "<green>expected<r>") orelse return .zero;
+
+        const negated = this.flags.not;
+        // The assertion holds when deep equality and "negated" disagree.
+        if (received.jestDeepEquals(expected, globalObject) != negated) return this_value;
+
+        // Failure: throw a diff of received vs. expected.
+        const diff_fmt = DiffFormatter{
+            .received = received,
+            .expected = expected,
+            .globalObject = globalObject,
+            .not = negated,
+        };
+
+        if (negated) {
+            const fmt = comptime getSignature("toEqual", "<green>expected<r>", true) ++ "\n\n{any}\n";
+            globalObject.throwPretty(fmt, .{diff_fmt});
+        } else {
+            const fmt = comptime getSignature("toEqual", "<green>expected<r>", false) ++ "\n\n{any}\n";
+            globalObject.throwPretty(fmt, .{diff_fmt});
+        }
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toStrictEqual(expected)`.
+    ///
+    /// Requires one argument; otherwise an invalid-arguments error is thrown.
+    /// The received value is compared to `expected` with
+    /// `jestStrictDeepEquals`, and the outcome is inverted when the `.not` flag
+    /// is set. On success the call's `this` value is returned. On failure a
+    /// `DiffFormatter` rendering of both values is thrown and `.zero` is
+    /// returned.
+    pub fn toStrictEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const call_args = callFrame.arguments(1);
+
+        if (call_args.len < 1) {
+            globalObject.throwInvalidArguments("toStrictEqual() requires 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const expected = call_args.ptr[0];
+        const received: JSValue = this.getValue(globalObject, this_value, "toStrictEqual", "<green>expected<r>") orelse return .zero;
+
+        const negated = this.flags.not;
+        // The assertion holds when strict deep equality and "negated" disagree.
+        if (received.jestStrictDeepEquals(expected, globalObject) != negated) return this_value;
+
+        // Failure: throw a diff of received vs. expected.
+        const diff_fmt = DiffFormatter{
+            .received = received,
+            .expected = expected,
+            .globalObject = globalObject,
+            .not = negated,
+        };
+
+        if (negated) {
+            const fmt = comptime getSignature("toStrictEqual", "<green>expected<r>", true) ++ "\n\n{any}\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{diff_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{diff_fmt});
+            }
+        } else {
+            const fmt = comptime getSignature("toStrictEqual", "<green>expected<r>", false) ++ "\n\n{any}\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{diff_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{diff_fmt});
+            }
+        }
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toHaveProperty(path, value?)`.
+    ///
+    /// Arguments (read from `callFrame`):
+    ///   1. `path`  - required; must be a string or an iterable, otherwise an
+    ///      error is thrown.
+    ///   2. `value` - optional; when present, the property found at `path` must
+    ///      also be deeply equal to it (`jestDeepEquals`).
+    ///
+    /// The property is looked up with `getIfPropertyExistsFromPath`; an
+    /// undefined or null received value never has the property. The outcome is
+    /// inverted when the `.not` flag is set. On success the call's `this` value
+    /// is returned. On failure an error is thrown on `globalObject` and `.zero`
+    /// is returned. `postMatch` runs on every exit path.
+    pub fn toHaveProperty(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(2);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toHaveProperty() requires at least 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const expected_property_path = arguments[0];
+        expected_property_path.ensureStillAlive();
+        const expected_property: ?JSValue = if (arguments.len > 1) arguments[1] else null;
+        if (expected_property) |ev| ev.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toHaveProperty", "<green>path<r><d>, <r><green>value<r>") orelse return .zero;
+
+        if (!expected_property_path.isString() and !expected_property_path.isIterable(globalObject)) {
+            globalObject.throw("Expected path must be a string or an array", .{});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+        // NOTE(review): `path_string` is never read afterwards. The conversion is
+        // kept because it may run a JS-side string conversion of the path; confirm
+        // whether it can be dropped.
+        var path_string = ZigString.Empty;
+        expected_property_path.toZigString(&path_string, globalObject);
+
+        var pass = !value.isUndefinedOrNull();
+        var received_property: JSValue = .zero;
+
+        if (pass) {
+            received_property = value.getIfPropertyExistsFromPath(globalObject, expected_property_path);
+            pass = !received_property.isEmpty();
+        }
+
+        if (pass) {
+            if (expected_property) |expected_value| {
+                pass = received_property.jestDeepEquals(expected_value, globalObject);
+            }
+        }
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        if (not) {
+            if (expected_property) |expected_value| {
+                const signature = comptime getSignature("toHaveProperty", "<green>path<r><d>, <r><green>value<r>", true);
+                if (!received_property.isEmpty()) {
+                    const fmt = signature ++ "\n\nExpected path: <green>{any}<r>\n\nExpected value: not <green>{any}<r>\n";
+                    if (Output.enable_ansi_colors) {
+                        globalObject.throw(Output.prettyFmt(fmt, true), .{
+                            expected_property_path.toFmt(globalObject, &formatter),
+                            expected_value.toFmt(globalObject, &formatter),
+                        });
+                        return .zero;
+                    }
+                    // Colors are disabled here, so use the non-color format
+                    // (this call previously passed `true`).
+                    globalObject.throw(Output.prettyFmt(fmt, false), .{
+                        expected_property_path.toFmt(globalObject, &formatter),
+                        expected_value.toFmt(globalObject, &formatter),
+                    });
+                    return .zero;
+                }
+            }
+
+            // `.not` without an expected value: the property exists but should not.
+            const signature = comptime getSignature("toHaveProperty", "<green>path<r>", true);
+            const fmt = signature ++ "\n\nExpected path: not <green>{any}<r>\n\nReceived value: <red>{any}<r>\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{
+                    expected_property_path.toFmt(globalObject, &formatter),
+                    received_property.toFmt(globalObject, &formatter),
+                });
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{
+                expected_property_path.toFmt(globalObject, &formatter),
+                received_property.toFmt(globalObject, &formatter),
+            });
+            return .zero;
+        }
+
+        if (expected_property) |expected_value| {
+            const signature = comptime getSignature("toHaveProperty", "<green>path<r><d>, <r><green>value<r>", false);
+            if (!received_property.isEmpty()) {
+                // Property found but its value differs: show a diff.
+                const fmt = signature ++ "\n\n{any}\n";
+                const diff_format = DiffFormatter{
+                    .received = received_property,
+                    .expected = expected_value,
+                    .globalObject = globalObject,
+                };
+
+                if (Output.enable_ansi_colors) {
+                    globalObject.throw(Output.prettyFmt(fmt, true), .{diff_format});
+                    return .zero;
+                }
+                globalObject.throw(Output.prettyFmt(fmt, false), .{diff_format});
+                return .zero;
+            }
+
+            // Property missing while a value was expected.
+            const fmt = signature ++ "\n\nExpected path: <green>{any}<r>\n\nExpected value: <green>{any}<r>\n\n" ++
+                "Unable to find property\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{
+                    expected_property_path.toFmt(globalObject, &formatter),
+                    expected_value.toFmt(globalObject, &formatter),
+                });
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{
+                expected_property_path.toFmt(globalObject, &formatter),
+                expected_value.toFmt(globalObject, &formatter),
+            });
+            return .zero;
+        }
+
+        // Property missing and no value was expected.
+        const signature = comptime getSignature("toHaveProperty", "<green>path<r>", false);
+        const fmt = signature ++ "\n\nExpected path: <green>{any}<r>\n\nUnable to find property\n";
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{expected_property_path.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+        globalObject.throw(Output.prettyFmt(fmt, false), .{expected_property_path.toFmt(globalObject, &formatter)});
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeEven()`.
+    ///
+    /// Succeeds when the received value is an even integer (number or BigInt);
+    /// any other value counts as not even. The outcome is inverted when the
+    /// `.not` flag is set. On success the call's `this` value is returned. On
+    /// failure an error is thrown on `globalObject` and `.zero` is returned.
+    pub fn toBeEven(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, this_value, "toBeEven", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+
+        // An integer zero already satisfies the parity tests below, so it needs
+        // no separate check.
+        const holds = blk: {
+            if (received.isAnyInt()) break :blk @mod(received.toInt64(), 2) == 0;
+            if (received.isBigInt() or received.isBigInt32()) break :blk received.toInt64() & 1 == 0;
+            if (received.isNumber()) {
+                const number = JSValue.asNumber(received);
+                // No fractional part, and divisible by two.
+                break :blk @mod(number, 1) == 0 and @mod(number, 2) == 0;
+            }
+            break :blk false;
+        };
+
+        if (holds != negated) return this_value;
+
+        // Failure: print the received value underneath the matcher signature.
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        const received_line = "Received: <red>{any}<r>\n";
+
+        if (negated) {
+            const fmt = comptime getSignature("toBeEven", "", true) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+        } else {
+            const fmt = comptime getSignature("toBeEven", "", false) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+        }
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeGreaterThan(expected)`.
+    ///
+    /// Requires one argument. Both the received value and the argument must be
+    /// numbers or BigInts, otherwise an error is thrown. Plain numbers are
+    /// compared with `>`; if either side is a BigInt the comparison goes
+    /// through `asBigIntCompare`. The outcome is inverted when the `.not` flag
+    /// is set. On success the call's `this` value is returned. On failure an
+    /// error is thrown on `globalObject` and `.zero` is returned.
+    pub fn toBeGreaterThan(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const call_args = callFrame.arguments(1);
+
+        if (call_args.len < 1) {
+            globalObject.throwInvalidArguments("toBeGreaterThan() requires 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const other_value = call_args.ptr[0];
+        other_value.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, this_value, "toBeGreaterThan", "<green>expected<r>") orelse return .zero;
+
+        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
+            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
+            return .zero;
+        }
+
+        const negated = this.flags.not;
+
+        const holds = blk: {
+            if (!value.isBigInt() and !other_value.isBigInt()) {
+                break :blk value.asNumber() > other_value.asNumber();
+            }
+            if (value.isBigInt()) {
+                break :blk switch (value.asBigIntCompare(globalObject, other_value)) {
+                    .greater_than => true,
+                    else => false,
+                };
+            }
+            // Only the argument is a BigInt: compare from its side, mirrored.
+            break :blk switch (other_value.asBigIntCompare(globalObject, value)) {
+                .less_than => true,
+                else => false,
+            };
+        };
+
+        if (holds != negated) return this_value;
+
+        // Failure: print the expected bound and the received value.
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const value_fmt = value.toFmt(globalObject, &formatter);
+        const expected_fmt = other_value.toFmt(globalObject, &formatter);
+        const received_line = "Received: <red>{any}<r>\n";
+
+        if (negated) {
+            const expected_line = "Expected: not \\> <green>{any}<r>\n";
+            const fmt = comptime getSignature("toBeGreaterThan", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+            }
+        } else {
+            const expected_line = "Expected: \\> <green>{any}<r>\n";
+            const fmt = comptime getSignature("toBeGreaterThan", "<green>expected<r>", false) ++ "\n\n" ++
+                expected_line ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+            }
+        }
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeGreaterThanOrEqual(expected)`.
+    ///
+    /// Requires one argument. Both the received value and the argument must be
+    /// numbers or BigInts, otherwise an error is thrown. Plain numbers are
+    /// compared with `>=`; if either side is a BigInt the comparison goes
+    /// through `asBigIntCompare`. The outcome is inverted when the `.not` flag
+    /// is set. On success the call's `this` value is returned. On failure an
+    /// error is thrown on `globalObject` and `.zero` is returned.
+    pub fn toBeGreaterThanOrEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(1);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toBeGreaterThanOrEqual() requires 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const other_value = arguments[0];
+        other_value.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toBeGreaterThanOrEqual", "<green>expected<r>") orelse return .zero;
+
+        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
+            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+        var pass = false;
+
+        if (!value.isBigInt() and !other_value.isBigInt()) {
+            pass = value.asNumber() >= other_value.asNumber();
+        } else if (value.isBigInt()) {
+            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
+                .greater_than, .equal => true,
+                else => pass,
+            };
+        } else {
+            // Only the argument is a BigInt: compare from its side, mirrored.
+            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
+                .less_than, .equal => true,
+                else => pass,
+            };
+        }
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const value_fmt = value.toFmt(globalObject, &formatter);
+        const expected_fmt = other_value.toFmt(globalObject, &formatter);
+        if (not) {
+            const expected_line = "Expected: not \\>= <green>{any}<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toBeGreaterThanOrEqual", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected: \\>= <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toBeGreaterThanOrEqual", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        // Without this throw the failure would return `.zero` with no error
+        // raised whenever ANSI colors are disabled.
+        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeLessThan(expected)`.
+    ///
+    /// Requires one argument. Both the received value and the argument must be
+    /// numbers or BigInts, otherwise an error is thrown. Plain numbers are
+    /// compared with `<`; if either side is a BigInt the comparison goes
+    /// through `asBigIntCompare`. The outcome is inverted when the `.not` flag
+    /// is set. On success the call's `this` value is returned. On failure an
+    /// error is thrown on `globalObject` and `.zero` is returned.
+    pub fn toBeLessThan(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(1);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toBeLessThan() requires 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const other_value = arguments[0];
+        other_value.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toBeLessThan", "<green>expected<r>") orelse return .zero;
+
+        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
+            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+        var pass = false;
+
+        if (!value.isBigInt() and !other_value.isBigInt()) {
+            pass = value.asNumber() < other_value.asNumber();
+        } else if (value.isBigInt()) {
+            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
+                .less_than => true,
+                else => pass,
+            };
+        } else {
+            // Only the argument is a BigInt: compare from its side, mirrored.
+            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
+                .greater_than => true,
+                else => pass,
+            };
+        }
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const value_fmt = value.toFmt(globalObject, &formatter);
+        const expected_fmt = other_value.toFmt(globalObject, &formatter);
+        if (not) {
+            const expected_line = "Expected: not \\< <green>{any}<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toBeLessThan", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected: \\< <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toBeLessThan", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        // Without this throw the failure would return `.zero` with no error
+        // raised whenever ANSI colors are disabled.
+        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeLessThanOrEqual(expected)`.
+    ///
+    /// Requires one argument. Both the received value and the argument must be
+    /// numbers or BigInts, otherwise an error is thrown. Plain numbers are
+    /// compared with `<=`; if either side is a BigInt the comparison goes
+    /// through `asBigIntCompare`. The outcome is inverted when the `.not` flag
+    /// is set. On success the call's `this` value is returned. On failure an
+    /// error is thrown on `globalObject` and `.zero` is returned.
+    pub fn toBeLessThanOrEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(1);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toBeLessThanOrEqual() requires 1 argument", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const other_value = arguments[0];
+        other_value.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toBeLessThanOrEqual", "<green>expected<r>") orelse return .zero;
+
+        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
+            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+        var pass = false;
+
+        if (!value.isBigInt() and !other_value.isBigInt()) {
+            pass = value.asNumber() <= other_value.asNumber();
+        } else if (value.isBigInt()) {
+            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
+                .less_than, .equal => true,
+                else => pass,
+            };
+        } else {
+            // Only the argument is a BigInt: compare from its side, mirrored.
+            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
+                .greater_than, .equal => true,
+                else => pass,
+            };
+        }
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const value_fmt = value.toFmt(globalObject, &formatter);
+        const expected_fmt = other_value.toFmt(globalObject, &formatter);
+        if (not) {
+            const expected_line = "Expected: not \\<= <green>{any}<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toBeLessThanOrEqual", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected: \\<= <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toBeLessThanOrEqual", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        // Without this throw the failure would return `.zero` with no error
+        // raised whenever ANSI colors are disabled.
+        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeCloseTo(expected, precision?)`.
+    ///
+    /// Arguments (read from `callFrame`):
+    ///   1. `expected`  - required; must be a number.
+    ///   2. `precision` - optional number, defaults to 2.
+    ///
+    /// The received value must also be a number. The assertion holds when
+    /// `|received - expected| < 10^(-precision) / 2`, or when both values are
+    /// the same infinity (their difference would be NaN). The outcome is
+    /// inverted when the `.not` flag is set. On success the call's `this` value
+    /// is returned. On failure an error is thrown on `globalObject` and `.zero`
+    /// is returned. `postMatch` runs on every exit path.
+    pub fn toBeCloseTo(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const thisArguments = callFrame.arguments(2);
+        const arguments = thisArguments.ptr[0..thisArguments.len];
+
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toBeCloseTo() requires at least 1 argument. Expected value must be a number", .{});
+            return .zero;
+        }
+
+        // Count this expectation, as the sibling matchers do.
+        active_test_expectation_counter.actual += 1;
+
+        const expected_ = arguments[0];
+        if (!expected_.isNumber()) {
+            globalObject.throwInvalidArgumentType("toBeCloseTo", "expected", "number");
+            return .zero;
+        }
+
+        var precision: f64 = 2.0;
+        if (arguments.len > 1) {
+            const precision_ = arguments[1];
+            if (!precision_.isNumber()) {
+                globalObject.throwInvalidArgumentType("toBeCloseTo", "precision", "number");
+                return .zero;
+            }
+
+            precision = precision_.asNumber();
+        }
+
+        const received_: JSValue = this.getValue(globalObject, thisValue, "toBeCloseTo", "<green>expected<r>, precision") orelse return .zero;
+        if (!received_.isNumber()) {
+            globalObject.throwInvalidArgumentType("expect", "received", "number");
+            return .zero;
+        }
+
+        const expected = expected_.asNumber();
+        const received = received_.asNumber();
+
+        var expected_diff: f64 = 0;
+        var actual_diff: f64 = 0;
+        var pass = false;
+
+        const both_positive_inf = std.math.isPositiveInf(expected) and std.math.isPositiveInf(received);
+        const both_negative_inf = std.math.isNegativeInf(expected) and std.math.isNegativeInf(received);
+        if (both_positive_inf or both_negative_inf) {
+            // Same-signed infinities are equal, but subtracting them gives NaN,
+            // so decide directly. Opposite-signed infinities fall through and
+            // fail with an infinite difference.
+            pass = true;
+        } else {
+            expected_diff = std.math.pow(f64, 10, -precision) / 2;
+            actual_diff = std.math.fabs(received - expected);
+            pass = actual_diff < expected_diff;
+        }
+
+        // Apply `.not` after every case above, including the infinity case.
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        // handle failure
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        const expected_fmt = expected_.toFmt(globalObject, &formatter);
+        const received_fmt = received_.toFmt(globalObject, &formatter);
+
+        const expected_line = "Expected: <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const expected_precision = "Expected precision: {d}\n";
+        const expected_difference = "Expected difference: \\< <green>{d}<r>\n";
+        const received_difference = "Received difference: <red>{d}<r>\n";
+
+        const suffix_fmt = "\n\n" ++ expected_line ++ received_line ++ "\n" ++ expected_precision ++ expected_difference ++ received_difference;
+
+        if (not) {
+            const fmt = comptime getSignature("toBeCloseTo", "<green>expected<r>, precision", true) ++ suffix_fmt;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeCloseTo", "<green>expected<r>, precision", false) ++ suffix_fmt;
+
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
+            return .zero;
+        }
+
+        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
+        return .zero;
+    }
+
+    /// Matcher behind `expect(value).toBeOdd()`.
+    ///
+    /// Succeeds when the received value is an odd integer (number or BigInt);
+    /// any other value counts as not odd. The outcome is inverted when the
+    /// `.not` flag is set. On success the call's `this` value is returned. On
+    /// failure an error is thrown on `globalObject` and `.zero` is returned.
+    pub fn toBeOdd(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const received: JSValue = this.getValue(globalObject, this_value, "toBeOdd", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+
+        // Check the representations in order and stop at the first that applies.
+        const holds = blk: {
+            if (received.isBigInt32()) break :blk received.toInt32() & 1 == 1;
+            if (received.isBigInt()) break :blk received.toInt64() & 1 == 1;
+            if (received.isInt32()) break :blk @mod(received.toInt32(), 2) == 1;
+            if (received.isAnyInt()) break :blk @mod(received.toInt64(), 2) == 1;
+            if (received.isNumber()) {
+                const number = JSValue.asNumber(received);
+                // No fractional part, and a remainder of one when halved.
+                break :blk @mod(number, 1) == 0 and @mod(number, 2) == 1;
+            }
+            break :blk false;
+        };
+
+        if (holds != negated) return this_value;
+
+        // Failure: print the received value underneath the matcher signature.
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_fmt = received.toFmt(globalObject, &formatter);
+        const received_line = "Received: <red>{any}<r>\n";
+
+        if (negated) {
+            const fmt = comptime getSignature("toBeOdd", "", true) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+        } else {
+            const fmt = comptime getSignature("toBeOdd", "", false) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
+            } else {
+                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
+            }
+        }
+        return .zero;
+    }
+
+    /// `expect(fn).toThrow(expected?)` matcher.
+    ///
+    /// Calls the received function with no arguments and determines whether it threw
+    /// (or returned a promise that rejected). The optional `expected` argument must be a
+    /// string or an object and is compared against the thrown value as follows:
+    ///   - string: substring match against the thrown value's message
+    ///   - RegExp: `expected.test(message)`
+    ///   - object with a `message` property: same-value comparison of the two messages
+    ///   - any other object: `instanceof` check of the thrown value against `expected`
+    /// With `.not` set, each of those comparisons is inverted.
+    ///
+    /// Returns `thisValue` on success; on failure throws a formatted error on
+    /// `globalObject` and returns `.zero`.
+    pub fn toThrow(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(1);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        active_test_expectation_counter.actual += 1;
+
+        // `.zero` means "no expected value was passed".
+        const expected_value: JSValue = if (arguments.len > 0) brk: {
+            const value = arguments[0];
+            if (value.isEmptyOrUndefinedOrNull() or !value.isObject() and !value.isString()) {
+                var fmt = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+                globalObject.throw("Expected value must be string or Error: {any}", .{value.toFmt(globalObject, &fmt)});
+                return .zero;
+            }
+            break :brk value;
+        } else .zero;
+        expected_value.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toThrow", "<green>expected<r>") orelse return .zero;
+
+        if (!value.jsType().isFunction()) {
+            globalObject.throw("Expected value must be a function", .{});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+
+        // Run the function and capture whatever it threw / rejected with; `null` means it did not throw.
+        const result_: ?JSValue = brk: {
+            var vm = globalObject.bunVM();
+            var return_value: JSValue = .zero;
+            var scope = vm.unhandledRejectionScope();
+            var prev_unhandled_pending_rejection_to_capture = vm.unhandled_pending_rejection_to_capture;
+            vm.unhandled_pending_rejection_to_capture = &return_value;
+            vm.onUnhandledRejection = &VirtualMachine.onQuietUnhandledRejectionHandlerCaptureValue;
+            const return_value_from_function: JSValue = value.call(globalObject, &.{});
+            vm.unhandled_pending_rejection_to_capture = prev_unhandled_pending_rejection_to_capture;
+
+            // Nothing was captured by the rejection handler: fall back to the call's own result.
+            if (return_value == .zero) {
+                return_value = return_value_from_function;
+            }
+
+            if (return_value.asAnyPromise()) |promise| {
+                globalObject.bunVM().waitForPromise(promise);
+                scope.apply(vm);
+                const promise_result = promise.result(globalObject.vm());
+
+                switch (promise.status(globalObject.vm())) {
+                    .Fulfilled => {
+                        break :brk null;
+                    },
+                    .Rejected => {
+                        // since we know for sure it rejected, we should always return the error
+                        break :brk promise_result.toError() orelse promise_result;
+                    },
+                    .Pending => unreachable,
+                }
+            }
+            scope.apply(vm);
+
+            break :brk return_value.toError();
+        };
+
+        const did_throw = result_ != null;
+
+        if (not) {
+            const signature = comptime getSignature("toThrow", "<green>expected<r>", true);
+
+            if (!did_throw) return thisValue;
+
+            const result: JSValue = result_.?;
+            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+            if (expected_value.isEmpty()) {
+                const signature_no_args = comptime getSignature("toThrow", "", true);
+                if (result.toError()) |err| {
+                    const name = err.get(globalObject, "name") orelse JSValue.undefined;
+                    const message = err.get(globalObject, "message") orelse JSValue.undefined;
+                    const fmt = signature_no_args ++ "\n\nError name: <red>{any}<r>\nError message: <red>{any}<r>\n";
+                    globalObject.throwPretty(fmt, .{
+                        name.toFmt(globalObject, &formatter),
+                        message.toFmt(globalObject, &formatter),
+                    });
+                    return .zero;
+                }
+
+                // non error thrown
+                const fmt = signature_no_args ++ "\n\nThrown value: <red>{any}<r>\n";
+                globalObject.throwPretty(fmt, .{result.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+
+            if (expected_value.isString()) {
+                const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
+
+                // TODO: remove this allocation
+                // partial match
+                {
+                    const expected_slice = expected_value.toSliceOrNull(globalObject) orelse return .zero;
+                    defer expected_slice.deinit();
+                    const received_slice = received_message.toSliceOrNull(globalObject) orelse return .zero;
+                    defer received_slice.deinit();
+                    if (!strings.contains(received_slice.slice(), expected_slice.slice())) return thisValue;
+                }
+
+                const fmt = signature ++ "\n\nExpected substring: not <green>{any}<r>\nReceived message: <red>{any}<r>\n";
+                globalObject.throwPretty(fmt, .{
+                    expected_value.toFmt(globalObject, &formatter),
+                    received_message.toFmt(globalObject, &formatter),
+                });
+                return .zero;
+            }
+
+            if (expected_value.isRegExp()) {
+                const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
+
+                // TODO: REMOVE THIS GETTER! Expose a binding to call .test on the RegExp object directly.
+                if (expected_value.get(globalObject, "test")) |test_fn| {
+                    const matches = test_fn.callWithThis(globalObject, expected_value, &.{received_message});
+                    if (!matches.toBooleanSlow(globalObject)) return thisValue;
+                }
+
+                const fmt = signature ++ "\n\nExpected pattern: not <green>{any}<r>\nReceived message: <red>{any}<r>\n";
+                globalObject.throwPretty(fmt, .{
+                    expected_value.toFmt(globalObject, &formatter),
+                    received_message.toFmt(globalObject, &formatter),
+                });
+                return .zero;
+            }
+
+            if (expected_value.get(globalObject, "message")) |expected_message| {
+                const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
+                // no partial match for this case
+                if (!expected_message.isSameValue(received_message, globalObject)) return thisValue;
+
+                const fmt = signature ++ "\n\nExpected message: not <green>{any}<r>\n";
+                globalObject.throwPretty(fmt, .{expected_message.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+
+            if (!result.isInstanceOf(globalObject, expected_value)) return thisValue;
+
+            var expected_class = ZigString.Empty;
+            expected_value.getClassName(globalObject, &expected_class);
+            const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
+            const fmt = signature ++ "\n\nExpected constructor: not <green>{s}<r>\n\nReceived message: <red>{any}<r>\n";
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_class, received_message.toFmt(globalObject, &formatter) });
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_class, received_message.toFmt(globalObject, &formatter) });
+            return .zero;
+        }
+
+        const signature = comptime getSignature("toThrow", "<green>expected<r>", false);
+        if (did_throw) {
+            if (expected_value.isEmpty()) return thisValue;
+
+            const result: JSValue = if (result_.?.toError()) |r|
+                r
+            else
+                result_.?;
+
+            // For objects use their `message` property; otherwise stringify the thrown value.
+            const _received_message: ?JSValue = if (result.isObject())
+                result.get(globalObject, "message")
+            else if (result.toStringOrNull(globalObject)) |js_str|
+                JSC.JSValue.fromCell(js_str)
+            else
+                null;
+
+            if (expected_value.isString()) {
+                if (_received_message) |received_message| {
+                    // TODO: remove this allocation
+                    // partial match
+                    const expected_slice = expected_value.toSliceOrNull(globalObject) orelse return .zero;
+                    defer expected_slice.deinit();
+                    const received_slice = received_message.toSlice(globalObject, globalObject.allocator());
+                    defer received_slice.deinit();
+                    if (strings.contains(received_slice.slice(), expected_slice.slice())) return thisValue;
+                }
+
+                // error: message from received error does not match expected string
+                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+                if (_received_message) |received_message| {
+                    const expected_value_fmt = expected_value.toFmt(globalObject, &formatter);
+                    const received_message_fmt = received_message.toFmt(globalObject, &formatter);
+                    const fmt = signature ++ "\n\n" ++ "Expected substring: <green>{any}<r>\nReceived message: <red>{any}<r>\n";
+                    globalObject.throwPretty(fmt, .{ expected_value_fmt, received_message_fmt });
+                    return .zero;
+                }
+
+                const expected_fmt = expected_value.toFmt(globalObject, &formatter);
+                const received_fmt = result.toFmt(globalObject, &formatter);
+                const fmt = signature ++ "\n\n" ++ "Expected substring: <green>{any}<r>\nReceived value: <red>{any}<r>";
+                globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
+
+                return .zero;
+            }
+
+            if (expected_value.isRegExp()) {
+                if (_received_message) |received_message| {
+                    // TODO: REMOVE THIS GETTER! Expose a binding to call .test on the RegExp object directly.
+                    if (expected_value.get(globalObject, "test")) |test_fn| {
+                        const matches = test_fn.callWithThis(globalObject, expected_value, &.{received_message});
+                        if (matches.toBooleanSlow(globalObject)) return thisValue;
+                    }
+                }
+
+                // error: message from received error does not match expected pattern
+                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+                if (_received_message) |received_message| {
+                    const expected_value_fmt = expected_value.toFmt(globalObject, &formatter);
+                    const received_message_fmt = received_message.toFmt(globalObject, &formatter);
+                    const fmt = signature ++ "\n\n" ++ "Expected pattern: <green>{any}<r>\nReceived message: <red>{any}<r>\n";
+                    globalObject.throwPretty(fmt, .{ expected_value_fmt, received_message_fmt });
+
+                    return .zero;
+                }
+
+                const expected_fmt = expected_value.toFmt(globalObject, &formatter);
+                const received_fmt = result.toFmt(globalObject, &formatter);
+                const fmt = signature ++ "\n\n" ++ "Expected pattern: <green>{any}<r>\nReceived value: <red>{any}<r>";
+                globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
+                return .zero;
+            }
+
+            // If it's not an object, we are going to crash here.
+            std.debug.assert(expected_value.isObject());
+
+            if (expected_value.get(globalObject, "message")) |expected_message| {
+                if (_received_message) |received_message| {
+                    if (received_message.isSameValue(expected_message, globalObject)) return thisValue;
+                }
+
+                // error: message from received error does not match expected error message.
+                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+                if (_received_message) |received_message| {
+                    const expected_fmt = expected_message.toFmt(globalObject, &formatter);
+                    const received_fmt = received_message.toFmt(globalObject, &formatter);
+                    const fmt = signature ++ "\n\nExpected message: <green>{any}<r>\nReceived message: <red>{any}<r>\n";
+                    globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
+                    return .zero;
+                }
+
+                const expected_fmt = expected_message.toFmt(globalObject, &formatter);
+                const received_fmt = result.toFmt(globalObject, &formatter);
+                const fmt = signature ++ "\n\nExpected message: <green>{any}<r>\nReceived value: <red>{any}<r>\n";
+                globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
+                return .zero;
+            }
+
+            if (result.isInstanceOf(globalObject, expected_value)) return thisValue;
+
+            // error: received error is not an instance of the expected error constructor
+            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+            var expected_class = ZigString.Empty;
+            var received_class = ZigString.Empty;
+            expected_value.getClassName(globalObject, &expected_class);
+            result.getClassName(globalObject, &received_class);
+            const fmt = signature ++ "\n\nExpected constructor: <green>{s}<r>\nReceived constructor: <red>{s}<r>\n\n";
+
+            if (_received_message) |received_message| {
+                const message_fmt = fmt ++ "Received message: <red>{any}<r>\n";
+                const received_message_fmt = received_message.toFmt(globalObject, &formatter);
+
+                globalObject.throwPretty(message_fmt, .{
+                    expected_class,
+                    received_class,
+                    received_message_fmt,
+                });
+                return .zero;
+            }
+
+            const received_fmt = result.toFmt(globalObject, &formatter);
+            const value_fmt = fmt ++ "Received value: <red>{any}<r>\n";
+
+            globalObject.throwPretty(value_fmt, .{
+                expected_class,
+                received_class,
+                received_fmt,
+            });
+            return .zero;
+        }
+
+        // did not throw
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+        const received_line = "Received function did not throw\n";
+
+        if (expected_value.isEmpty()) {
+            const fmt = comptime getSignature("toThrow", "", false) ++ "\n\n" ++ received_line;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{});
+                return .zero;
+            }
+            globalObject.throw(Output.prettyFmt(fmt, false), .{});
+            return .zero;
+        }
+
+        if (expected_value.isString()) {
+            const expected_fmt = "\n\nExpected substring: <green>{any}<r>\n\n" ++ received_line;
+            const fmt = signature ++ expected_fmt;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_value.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        if (expected_value.isRegExp()) {
+            const expected_fmt = "\n\nExpected pattern: <green>{any}<r>\n\n" ++ received_line;
+            const fmt = signature ++ expected_fmt;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_value.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        if (expected_value.get(globalObject, "message")) |expected_message| {
+            const expected_fmt = "\n\nExpected message: <green>{any}<r>\n\n" ++ received_line;
+            const fmt = signature ++ expected_fmt;
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_message.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+
+            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_message.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        const expected_fmt = "\n\nExpected constructor: <green>{s}<r>\n\n" ++ received_line;
+        var expected_class = ZigString.Empty;
+        expected_value.getClassName(globalObject, &expected_class);
+        const fmt = signature ++ expected_fmt;
+        if (Output.enable_ansi_colors) {
+            globalObject.throw(Output.prettyFmt(fmt, true), .{expected_class});
+            return .zero;
+        }
+        // Colors are disabled on this path, so the format must be rendered without ANSI escapes.
+        globalObject.throw(Output.prettyFmt(fmt, false), .{expected_class});
+        return .zero;
+    }
+
+    /// `expect(value).toMatchSnapshot(propertyMatchers?, hint?)` matcher.
+    ///
+    /// Cannot be combined with `.not`. With a single argument, a string is taken as the
+    /// hint and an object as the property matchers. With two or more arguments the first
+    /// must be an object (property matchers) and the second is used as the hint when it is
+    /// a string. When property matchers are given, the received value must be an object
+    /// and must deep-match them before the snapshot is looked up.
+    ///
+    /// Returns `thisValue` when the snapshot matches (or `getOrPut` yields no saved value);
+    /// otherwise throws on `globalObject` and returns `.zero`.
+    pub fn toMatchSnapshot(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalObject);
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(2);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        active_test_expectation_counter.actual += 1;
+
+        const not = this.flags.not;
+        if (not) {
+            const signature = comptime getSignature("toMatchSnapshot", "", true);
+            const fmt = signature ++ "\n\n<b>Matcher error<r>: Snapshot matchers cannot be used with <b>not<r>\n";
+            globalObject.throwPretty(fmt, .{});
+            // Stop here: continuing with a pending exception would still touch the snapshot store.
+            return .zero;
+        }
+
+        var hint_string: ZigString = ZigString.Empty;
+        var property_matchers: ?JSValue = null;
+        switch (arguments.len) {
+            0 => {},
+            1 => {
+                if (arguments[0].isString()) {
+                    arguments[0].toZigString(&hint_string, globalObject);
+                } else if (arguments[0].isObject()) {
+                    property_matchers = arguments[0];
+                }
+            },
+            else => {
+                if (!arguments[0].isObject()) {
+                    const signature = comptime getSignature("toMatchSnapshot", "<green>properties<r><d>, <r>hint", false);
+                    const fmt = signature ++ "\n\nMatcher error: Expected <green>properties<r> must be an object\n";
+                    globalObject.throwPretty(fmt, .{});
+                    return .zero;
+                }
+
+                property_matchers = arguments[0];
+
+                if (arguments[1].isString()) {
+                    arguments[1].toZigString(&hint_string, globalObject);
+                }
+            },
+        }
+
+        var hint = hint_string.toSlice(default_allocator);
+        defer hint.deinit();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toMatchSnapshot", "<green>properties<r><d>, <r>hint") orelse return .zero;
+
+        if (!value.isObject() and property_matchers != null) {
+            const signature = comptime getSignature("toMatchSnapshot", "<green>properties<r><d>, <r>hint", false);
+            const fmt = signature ++ "\n\n<b>Matcher error: <red>received<r> values must be an object when the matcher has <green>properties<r>\n";
+            globalObject.throwPretty(fmt, .{});
+            return .zero;
+        }
+
+        if (property_matchers) |_prop_matchers| {
+            var prop_matchers = _prop_matchers;
+
+            if (!value.jestDeepMatch(prop_matchers, globalObject, true)) {
+                // TODO: print diff with properties from propertyMatchers
+                const signature = comptime getSignature("toMatchSnapshot", "<green>propertyMatchers<r>", false);
+                const fmt = signature ++ "\n\nExpected <green>propertyMatchers<r> to match properties from received object" ++
+                    "\n\nReceived: {any}\n";
+
+                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject };
+                globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+        }
+
+        const result = Jest.runner.?.snapshots.getOrPut(this, value, hint.slice(), globalObject) catch |err| {
+            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject };
+            const test_file_path = Jest.runner.?.files.get(this.scope.file_id).source.path.text;
+            switch (err) {
+                error.FailedToOpenSnapshotFile => globalObject.throw("Failed to open snapshot file for test file: {s}", .{test_file_path}),
+                error.FailedToMakeSnapshotDirectory => globalObject.throw("Failed to make snapshot directory for test file: {s}", .{test_file_path}),
+                error.FailedToWriteSnapshotFile => globalObject.throw("Failed write to snapshot file: {s}", .{test_file_path}),
+                error.ParseError => globalObject.throw("Failed to parse snapshot file for: {s}", .{test_file_path}),
+                else => globalObject.throw("Failed to snapshot value: {any}", .{value.toFmt(globalObject, &formatter)}),
+            }
+            return .zero;
+        };
+
+        if (result) |saved_value| {
+            var pretty_value: MutableString = MutableString.init(default_allocator, 0) catch unreachable;
+            // Register the cleanup before any early return so the buffer is released on every path.
+            defer pretty_value.deinit();
+            value.jestSnapshotPrettyFormat(&pretty_value, globalObject) catch {
+                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject };
+                globalObject.throw("Failed to pretty format value: {s}", .{value.toFmt(globalObject, &formatter)});
+                return .zero;
+            };
+
+            if (strings.eqlLong(pretty_value.toOwnedSliceLeaky(), saved_value, true)) {
+                Jest.runner.?.snapshots.passed += 1;
+                return thisValue;
+            }
+
+            Jest.runner.?.snapshots.failed += 1;
+            const signature = comptime getSignature("toMatchSnapshot", "<green>expected<r>", false);
+            const fmt = signature ++ "\n\n{any}\n";
+            const diff_format = DiffFormatter{
+                .received_string = pretty_value.toOwnedSliceLeaky(),
+                .expected_string = saved_value,
+                .globalObject = globalObject,
+            };
+
+            globalObject.throwPretty(fmt, .{diff_format});
+            return .zero;
+        }
+
+        return thisValue;
+    }
+
+    /// `expect(value).toBeEmpty()` matcher.
+    ///
+    /// Emptiness is decided from the value's length when
+    /// `getLengthIfPropertyExistsInternal` yields a finite number. When it yields infinity,
+    /// an iterable object is empty if iterating it visits nothing, and any other object is
+    /// empty if its property iterator reports zero entries. A non-object in that case is a
+    /// matcher error, and a NaN length is reported as a non-number length property.
+    ///
+    /// Honors `.not`. Returns `thisValue` on success; on failure throws on `globalObject`
+    /// and returns `.zero`.
+    pub fn toBeEmpty(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const value: JSValue = this.getValue(globalObject, thisValue, "toBeEmpty", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const not = this.flags.not;
+        var pass = false;
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        const actual_length = value.getLengthIfPropertyExistsInternal(globalObject);
+
+        if (actual_length == std.math.inf(f64)) {
+            if (value.jsTypeLoose().isObject()) {
+                if (value.isIterable(globalObject)) {
+                    // The callback fires once per visited element, so any call means "not empty".
+                    var any_properties_in_iterator = false;
+                    value.forEach(globalObject, &any_properties_in_iterator, struct {
+                        pub fn anythingInIterator(
+                            _: *JSC.VM,
+                            _: *JSGlobalObject,
+                            any_: ?*anyopaque,
+                            _: JSValue,
+                        ) callconv(.C) void {
+                            bun.cast(*bool, any_.?).* = true;
+                        }
+                    }.anythingInIterator);
+                    pass = !any_properties_in_iterator;
+                } else {
+                    var props_iter = JSC.JSPropertyIterator(.{
+                        .skip_empty_name = false,
+
+                        .include_value = true,
+                    }).init(globalObject, value.asObjectRef());
+                    defer props_iter.deinit();
+                    pass = props_iter.len == 0;
+                }
+            } else {
+                const signature = comptime getSignature("toBeEmpty", "", false);
+                const fmt = signature ++ "\n\nExpected value to be a string, object, or iterable" ++
+                    "\n\nReceived: <red>{any}<r>\n";
+                globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
+                return .zero;
+            }
+        } else if (std.math.isNan(actual_length)) {
+            globalObject.throw("Received value has non-number length property: {}", .{actual_length});
+            return .zero;
+        } else {
+            pass = actual_length == 0;
+        }
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // `.not.toBeEmpty()` on an empty value: report that it was expected to be non-empty.
+        if (not) {
+            const signature = comptime getSignature("toBeEmpty", "", true);
+            const fmt = signature ++ "\n\nExpected value <b>not<r> to be empty" ++
+                "\n\nReceived: <red>{any}<r>\n";
+            globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        const signature = comptime getSignature("toBeEmpty", "", false);
+        const fmt = signature ++ "\n\nExpected value to be empty" ++
+            "\n\nReceived: <red>{any}<r>\n";
+        globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
+        return .zero;
+    }
+
+    /// `expect(value).toBeNil()` matcher.
+    ///
+    /// Passes when the received value is `undefined` or `null` (inverted under `.not`).
+    /// Returns the `this` value of the call on success; on failure throws a formatted
+    /// error on `globalThis` and returns `.zero`.
+    pub fn toBeNil(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received: JSValue = this.getValue(globalThis, this_value, "toBeNil", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        if (received.isUndefinedOrNull() != negated) return this_value;
+
+        // Failure: render the received value into the matcher's error message.
+        const received_suffix = "\n\n" ++ "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received.toFmt(globalThis, &formatter);
+
+        if (negated) {
+            const message = comptime getSignature("toBeNil", "", true) ++ received_suffix;
+            globalThis.throwPretty(message, .{received_fmt});
+        } else {
+            const message = comptime getSignature("toBeNil", "", false) ++ received_suffix;
+            globalThis.throwPretty(message, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeArray()` matcher.
+    ///
+    /// Passes when the received value's JS type is an array (inverted under `.not`).
+    /// Throws "must be called in a test" when `this.test_id` does not index into the
+    /// scope's test list. Returns the `this` value of the call on success; on failure
+    /// throws a formatted error on `globalThis` and returns `.zero`.
+    pub fn toBeArray(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received: JSValue = this.getValue(globalThis, this_value, "toBeArray", "") orelse return .zero;
+
+        const known_test_count = this.scope.tests.items.len;
+        if (known_test_count <= this.test_id) {
+            globalThis.throw("toBeArray() must be called in a test", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const negated = this.flags.not;
+        if (received.jsType().isArray() != negated) return this_value;
+
+        // Failure: render the received value into the matcher's error message.
+        const received_suffix = "\n\n" ++ "Received: <red>{any}<r>\n";
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received.toFmt(globalThis, &formatter);
+
+        if (negated) {
+            const message = comptime getSignature("toBeArray", "", true) ++ received_suffix;
+            globalThis.throwPretty(message, .{received_fmt});
+        } else {
+            const message = comptime getSignature("toBeArray", "", false) ++ received_suffix;
+            globalThis.throwPretty(message, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeArrayOfSize(size)` matcher.
+    ///
+    /// Requires exactly one integer argument. Passes when the received value is an array
+    /// whose length equals `size` (inverted under `.not`). Throws when the argument is
+    /// missing or not an integer, or when `this.test_id` does not index into the scope's
+    /// test list. Returns `thisValue` on success; on failure throws a formatted error on
+    /// `globalThis` and returns `.zero`.
+    pub fn toBeArrayOfSize(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(1);
+        const arguments = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len < 1) {
+            globalThis.throwInvalidArguments("toBeArrayOfSize() requires 1 argument", .{});
+            return .zero;
+        }
+
+        const value: JSValue = this.getValue(globalThis, thisValue, "toBeArrayOfSize", "") orelse return .zero;
+
+        if (this.scope.tests.items.len <= this.test_id) {
+            globalThis.throw("toBeArrayOfSize() must be called in a test", .{});
+            return .zero;
+        }
+
+        const size = arguments[0];
+        size.ensureStillAlive();
+
+        if (!size.isAnyInt()) {
+            globalThis.throw("toBeArrayOfSize() requires the first argument to be a number", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        const not = this.flags.not;
+        // Compare as 64-bit integers. A JS array length can exceed maxInt(i32), which would
+        // make a narrowing cast to i32 invalid, and `size` was only validated with
+        // `isAnyInt()`, so reducing it to 32 bits could make mismatched sizes compare equal.
+        var pass = value.jsType().isArray() and @intCast(i64, value.getLength(globalThis)) == size.toInt64();
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received = value.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const fmt = comptime getSignature("toBeArrayOfSize", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(fmt, .{received});
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeArrayOfSize", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+        globalThis.throwPretty(fmt, .{received});
+        return .zero;
+    }
+
+    /// `expect(value).toBeBoolean()`: succeeds when `isBoolean()` holds for the
+    /// received value (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeBoolean(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeBoolean", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        if (received_value.isBoolean() != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeBoolean", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeBoolean", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeTypeOf(type)`: succeeds when the type category
+    /// computed for the received value equals the string `type` (inverted when
+    /// the `not` flag is set).
+    ///
+    /// `type` must be a string and one of 'function', 'object', 'bigint',
+    /// 'boolean', 'number', 'string', 'symbol', 'undefined'; anything else
+    /// throws an invalid-arguments error. Arrays and `null` are classified as
+    /// "object"; callables are classified as "function".
+    ///
+    /// Returns the call's `this` value on success; otherwise throws and
+    /// returns `.zero`.
+    pub fn toBeTypeOf(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(1);
+        const arguments = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len < 1) {
+            globalThis.throwInvalidArguments("toBeTypeOf() requires 1 argument", .{});
+            return .zero;
+        }
+
+        if (this.scope.tests.items.len <= this.test_id) {
+            globalThis.throw("toBeTypeOf() must be called in a test", .{});
+            return .zero;
+        }
+
+        const value: JSValue = this.getValue(globalThis, thisValue, "toBeTypeOf", "") orelse return .zero;
+
+        const expected = arguments[0];
+        expected.ensureStillAlive();
+
+        // Reject non-strings before converting, so `toString` is never invoked
+        // on an argument that is about to be refused anyway.
+        if (!expected.isString()) {
+            globalThis.throwInvalidArguments("toBeTypeOf() requires a string argument", .{});
+            return .zero;
+        }
+
+        // The slice was produced with `default_allocator` and may own memory;
+        // release it on every exit path from here on.
+        var expected_slice = expected.toString(globalThis).toSlice(globalThis, default_allocator);
+        defer expected_slice.deinit();
+        const expectedAsStr = expected_slice.slice();
+
+        active_test_expectation_counter.actual += 1;
+
+        if (!std.mem.eql(u8, expectedAsStr, "function") and
+            !std.mem.eql(u8, expectedAsStr, "object") and
+            !std.mem.eql(u8, expectedAsStr, "bigint") and
+            !std.mem.eql(u8, expectedAsStr, "boolean") and
+            !std.mem.eql(u8, expectedAsStr, "number") and
+            !std.mem.eql(u8, expectedAsStr, "string") and
+            !std.mem.eql(u8, expectedAsStr, "symbol") and
+            !std.mem.eql(u8, expectedAsStr, "undefined"))
+        {
+            globalThis.throwInvalidArguments("toBeTypeOf() requires a valid type string argument ('function', 'object', 'bigint', 'boolean', 'number', 'string', 'symbol', 'undefined')", .{});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+        var whatIsTheType: []const u8 = "";
+
+        // Checking for function/class should be done before everything else, or it will fail.
+        if (value.isCallable(globalThis.vm())) {
+            whatIsTheType = "function";
+        } else if (value.isObject() or value.jsType().isArray() or value.isNull()) {
+            whatIsTheType = "object";
+        } else if (value.isBigInt()) {
+            whatIsTheType = "bigint";
+        } else if (value.isBoolean()) {
+            whatIsTheType = "boolean";
+        } else if (value.isNumber()) {
+            whatIsTheType = "number";
+        } else if (value.jsType().isString()) {
+            whatIsTheType = "string";
+        } else if (value.isSymbol()) {
+            whatIsTheType = "symbol";
+        } else if (value.isUndefined()) {
+            whatIsTheType = "undefined";
+        } else {
+            globalThis.throw("Internal consistency error: unknown JSValue type", .{});
+            return .zero;
+        }
+
+        var pass = std.mem.eql(u8, expectedAsStr, whatIsTheType);
+
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received = value.toFmt(globalThis, &formatter);
+        const expected_str = expected.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const fmt = comptime getSignature("toBeTypeOf", "", true) ++ "\n\n" ++ "Expected type: not <green>{any}<r>\n" ++ "Received type: <red>\"{s}\"<r>\nReceived value: <red>{any}<r>\n";
+            globalThis.throwPretty(fmt, .{ expected_str, whatIsTheType, received });
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeTypeOf", "", false) ++ "\n\n" ++ "Expected type: <green>{any}<r>\n" ++ "Received type: <red>\"{s}\"<r>\nReceived value: <red>{any}<r>\n";
+        globalThis.throwPretty(fmt, .{ expected_str, whatIsTheType, received });
+        return .zero;
+    }
+
+    /// `expect(value).toBeTrue()`: succeeds when the received value is a
+    /// boolean and `toBoolean()` is true (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeTrue(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeTrue", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        // Only a real boolean counts; `toBoolean` is not consulted otherwise.
+        const is_true = received_value.isBoolean() and received_value.toBoolean();
+        if (is_true != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeTrue", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeTrue", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeFalse()`: succeeds when the received value is a
+    /// boolean and `toBoolean()` is false (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeFalse(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeFalse", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        // Only a real boolean counts; `toBoolean` is not consulted otherwise.
+        const is_false = received_value.isBoolean() and !received_value.toBoolean();
+        if (is_false != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeFalse", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeFalse", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeNumber()`: succeeds when `isNumber()` holds for the
+    /// received value (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeNumber(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeNumber", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        if (received_value.isNumber() != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeNumber", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeNumber", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeInteger()`: succeeds when `isAnyInt()` holds for the
+    /// received value (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeInteger(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeInteger", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        if (received_value.isAnyInt() != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeInteger", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeInteger", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeFinite()`: succeeds when the received value is a
+    /// number that is finite and not NaN (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeFinite(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeFinite", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        // `asNumber` is only consulted once the value is known to be a number.
+        const is_finite_number = received_value.isNumber() and blk: {
+            const num: f64 = received_value.asNumber();
+            break :blk std.math.isFinite(num) and !std.math.isNan(num);
+        };
+
+        const is_negated = this.flags.not;
+        if (is_finite_number != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeFinite", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeFinite", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBePositive()`: succeeds when the received value is a
+    /// number strictly greater than zero that is neither infinite nor NaN
+    /// (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBePositive(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const value: JSValue = this.getValue(globalThis, thisValue, "toBePositive", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        var pass = value.isNumber();
+        if (pass) {
+            const num: f64 = value.asNumber();
+            // Compare the number itself: rounding first would turn small
+            // positive fractions such as 0.3 into 0 and wrongly reject them.
+            pass = num > 0 and !std.math.isInf(num) and !std.math.isNan(num);
+        }
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received = value.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const fmt = comptime getSignature("toBePositive", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(fmt, .{received});
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBePositive", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+        globalThis.throwPretty(fmt, .{received});
+        return .zero;
+    }
+
+    /// `expect(value).toBeNegative()`: succeeds when the received value is a
+    /// number strictly less than zero that is neither infinite nor NaN
+    /// (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeNegative(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const value: JSValue = this.getValue(globalThis, thisValue, "toBeNegative", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        var pass = value.isNumber();
+        if (pass) {
+            const num: f64 = value.asNumber();
+            // Compare the number itself: rounding first would turn small
+            // negative fractions such as -0.3 into -0 and wrongly reject them.
+            pass = num < 0 and !std.math.isInf(num) and !std.math.isNan(num);
+        }
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received = value.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const fmt = comptime getSignature("toBeNegative", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(fmt, .{received});
+            return .zero;
+        }
+
+        const fmt = comptime getSignature("toBeNegative", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+        globalThis.throwPretty(fmt, .{received});
+        return .zero;
+    }
+
+    /// `expect(value).toBeWithin(start, end)`: succeeds when the received value
+    /// is a number with `start <= value < end` (inverted when the `not` flag is
+    /// set).
+    ///
+    /// Both `start` and `end` must be supplied and must be numbers; otherwise
+    /// an error is thrown and `.zero` is returned. Returns the call's `this`
+    /// value on success; on failure throws a pretty-printed error showing the
+    /// bounds and the received value, and returns `.zero`.
+    pub fn toBeWithin(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const _arguments = callFrame.arguments(2);
+        const arguments = _arguments.ptr[0.._arguments.len];
+
+        // Both bounds are indexed below, so fewer than two arguments must be
+        // rejected here rather than reading past the end of the slice.
+        if (arguments.len < 2) {
+            globalThis.throwInvalidArguments("toBeWithin() requires 2 arguments", .{});
+            return .zero;
+        }
+
+        const value: JSValue = this.getValue(globalThis, thisValue, "toBeWithin", "<green>start<r><d>, <r><green>end<r>") orelse return .zero;
+
+        const startValue = arguments[0];
+        startValue.ensureStillAlive();
+
+        if (!startValue.isNumber()) {
+            globalThis.throw("toBeWithin() requires the first argument to be a number", .{});
+            return .zero;
+        }
+
+        const endValue = arguments[1];
+        endValue.ensureStillAlive();
+
+        if (!endValue.isNumber()) {
+            globalThis.throw("toBeWithin() requires the second argument to be a number", .{});
+            return .zero;
+        }
+
+        active_test_expectation_counter.actual += 1;
+
+        var pass = value.isNumber();
+        if (pass) {
+            const num = value.asNumber();
+            // Half-open range: start is inclusive, end is exclusive.
+            pass = num >= startValue.asNumber() and num < endValue.asNumber();
+        }
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const start_fmt = startValue.toFmt(globalThis, &formatter);
+        const end_fmt = endValue.toFmt(globalThis, &formatter);
+        const received_fmt = value.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const expected_line = "Expected: not between <green>{any}<r> <d>(inclusive)<r> and <green>{any}<r> <d>(exclusive)<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toBeWithin", "<green>start<r><d>, <r><green>end<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            globalThis.throwPretty(fmt, .{ start_fmt, end_fmt, received_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected: between <green>{any}<r> <d>(inclusive)<r> and <green>{any}<r> <d>(exclusive)<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toBeWithin", "<green>start<r><d>, <r><green>end<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        globalThis.throwPretty(fmt, .{ start_fmt, end_fmt, received_fmt });
+        return .zero;
+    }
+
+    /// `expect(value).toBeSymbol()`: succeeds when `isSymbol()` holds for the
+    /// received value (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeSymbol(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeSymbol", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        if (received_value.isSymbol() != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeSymbol", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeSymbol", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeFunction()`: succeeds when the received value is
+    /// callable according to `isCallable` (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeFunction(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeFunction", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        const is_callable = received_value.isCallable(globalThis.vm());
+        if (is_callable != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeFunction", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeFunction", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeDate()`: succeeds when `isDate()` holds for the
+    /// received value (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeDate(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeDate", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        if (received_value.isDate() != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeDate", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeDate", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toBeString()`: succeeds when `isString()` holds for the
+    /// received value (inverted when the `not` flag is set).
+    ///
+    /// Returns the call's `this` value on success. On failure a pretty-printed
+    /// error showing the received value is thrown and `.zero` is returned.
+    /// `.zero` is also returned when `getValue` yields null.
+    pub fn toBeString(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const this_value = callFrame.this();
+        const received_value: JSValue = this.getValue(globalThis, this_value, "toBeString", "") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const is_negated = this.flags.not;
+        if (received_value.isString() != is_negated) return this_value;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const received_fmt = received_value.toFmt(globalThis, &formatter);
+
+        if (is_negated) {
+            const failure_fmt = comptime getSignature("toBeString", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        } else {
+            const failure_fmt = comptime getSignature("toBeString", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
+            globalThis.throwPretty(failure_fmt, .{received_fmt});
+        }
+        return .zero;
+    }
+
+    /// `expect(value).toInclude(expected)`: succeeds when the received value is
+    /// a string that contains the string `expected`; an empty `expected` always
+    /// matches a string (inverted when the `not` flag is set).
+    ///
+    /// `expected` must be supplied and must be a string; otherwise an error is
+    /// thrown and `.zero` is returned. Returns the call's `this` value on
+    /// success; on failure throws a pretty-printed error showing both values
+    /// and returns `.zero`.
+    pub fn toInclude(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const arguments_ = callFrame.arguments(1);
+        const arguments = arguments_.ptr[0..arguments_.len];
+
+        if (arguments.len < 1) {
+            globalThis.throwInvalidArguments("toInclude() requires 1 argument", .{});
+            return .zero;
+        }
+
+        const expected = arguments[0];
+        expected.ensureStillAlive();
+
+        if (!expected.isString()) {
+            globalThis.throw("toInclude() requires the first argument to be a string", .{});
+            return .zero;
+        }
+
+        // Same signature arguments as the failure messages below, matching
+        // how toStartWith/toEndWith call getValue.
+        const value: JSValue = this.getValue(globalThis, thisValue, "toInclude", "<green>expected<r>") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        var pass = value.isString();
+        if (pass) {
+            // The slices were produced with `default_allocator` and may own
+            // memory; release them once the comparison is done.
+            var value_slice = value.toString(globalThis).toSlice(globalThis, default_allocator);
+            defer value_slice.deinit();
+            var expected_slice = expected.toString(globalThis).toSlice(globalThis, default_allocator);
+            defer expected_slice.deinit();
+
+            const value_string = value_slice.slice();
+            const expected_string = expected_slice.slice();
+            pass = strings.contains(value_string, expected_string) or expected_string.len == 0;
+        }
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const value_fmt = value.toFmt(globalThis, &formatter);
+        const expected_fmt = expected.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const expected_line = "Expected to not include: <green>{any}<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toInclude", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected to include: <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toInclude", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    /// `expect(value).toStartWith(expected)`: succeeds when the received value
+    /// is a string that starts with the string `expected`; an empty `expected`
+    /// always matches a string (inverted when the `not` flag is set).
+    ///
+    /// `expected` must be supplied and must be a string; otherwise an error is
+    /// thrown and `.zero` is returned. Returns the call's `this` value on
+    /// success; on failure throws a pretty-printed error showing both values
+    /// and returns `.zero`.
+    pub fn toStartWith(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const arguments_ = callFrame.arguments(1);
+        const arguments = arguments_.ptr[0..arguments_.len];
+
+        if (arguments.len < 1) {
+            globalThis.throwInvalidArguments("toStartWith() requires 1 argument", .{});
+            return .zero;
+        }
+
+        const expected = arguments[0];
+        expected.ensureStillAlive();
+
+        if (!expected.isString()) {
+            globalThis.throw("toStartWith() requires the first argument to be a string", .{});
+            return .zero;
+        }
+
+        const value: JSValue = this.getValue(globalThis, thisValue, "toStartWith", "<green>expected<r>") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        var pass = value.isString();
+        if (pass) {
+            // The slices were produced with `default_allocator` and may own
+            // memory; release them once the comparison is done.
+            var value_slice = value.toString(globalThis).toSlice(globalThis, default_allocator);
+            defer value_slice.deinit();
+            var expected_slice = expected.toString(globalThis).toSlice(globalThis, default_allocator);
+            defer expected_slice.deinit();
+
+            const value_string = value_slice.slice();
+            const expected_string = expected_slice.slice();
+            pass = strings.startsWith(value_string, expected_string) or expected_string.len == 0;
+        }
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const value_fmt = value.toFmt(globalThis, &formatter);
+        const expected_fmt = expected.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const expected_line = "Expected to not start with: <green>{any}<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toStartWith", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected to start with: <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toStartWith", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    /// `expect(value).toEndWith(expected)`: succeeds when the received value is
+    /// a string that ends with the string `expected`; an empty `expected`
+    /// always matches a string (inverted when the `not` flag is set).
+    ///
+    /// `expected` must be supplied and must be a string; otherwise an error is
+    /// thrown and `.zero` is returned. Returns the call's `this` value on
+    /// success; on failure throws a pretty-printed error showing both values
+    /// and returns `.zero`.
+    pub fn toEndWith(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalThis);
+
+        const thisValue = callFrame.this();
+        const arguments_ = callFrame.arguments(1);
+        const arguments = arguments_.ptr[0..arguments_.len];
+
+        if (arguments.len < 1) {
+            globalThis.throwInvalidArguments("toEndWith() requires 1 argument", .{});
+            return .zero;
+        }
+
+        const expected = arguments[0];
+        expected.ensureStillAlive();
+
+        if (!expected.isString()) {
+            globalThis.throw("toEndWith() requires the first argument to be a string", .{});
+            return .zero;
+        }
+
+        const value: JSValue = this.getValue(globalThis, thisValue, "toEndWith", "<green>expected<r>") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        var pass = value.isString();
+        if (pass) {
+            // The slices were produced with `default_allocator` and may own
+            // memory; release them once the comparison is done.
+            var value_slice = value.toString(globalThis).toSlice(globalThis, default_allocator);
+            defer value_slice.deinit();
+            var expected_slice = expected.toString(globalThis).toSlice(globalThis, default_allocator);
+            defer expected_slice.deinit();
+
+            const value_string = value_slice.slice();
+            const expected_string = expected_slice.slice();
+            pass = strings.endsWith(value_string, expected_string) or expected_string.len == 0;
+        }
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+
+        if (pass) return thisValue;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
+        const value_fmt = value.toFmt(globalThis, &formatter);
+        const expected_fmt = expected.toFmt(globalThis, &formatter);
+
+        if (not) {
+            const expected_line = "Expected to not end with: <green>{any}<r>\n";
+            const received_line = "Received: <red>{any}<r>\n";
+            const fmt = comptime getSignature("toEndWith", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
+            return .zero;
+        }
+
+        const expected_line = "Expected to end with: <green>{any}<r>\n";
+        const received_line = "Received: <red>{any}<r>\n";
+        const fmt = comptime getSignature("toEndWith", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+        globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
+        return .zero;
+    }
+
+    /// `expect(value).toBeInstanceOf(expected)`: succeeds when
+    /// `isInstanceOf(expected)` holds for the received value (inverted when the
+    /// `not` flag is set).
+    ///
+    /// `expected` must be supplied and must satisfy `isConstructor()`;
+    /// otherwise an error is thrown and `.zero` is returned. Returns the call's
+    /// `this` value on success; on failure throws an error showing the
+    /// constructor and the received value, and returns `.zero`.
+    pub fn toBeInstanceOf(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        // Runs on every exit path below.
+        defer this.postMatch(globalObject);
+
+        const this_value = callFrame.this();
+        const raw_args = callFrame.arguments(1);
+        const matcher_args: []const JSValue = raw_args.ptr[0..raw_args.len];
+
+        if (matcher_args.len < 1) {
+            globalObject.throwInvalidArguments("toBeInstanceOf() requires 1 argument", .{});
+            return .zero;
+        }
+
+        // Counted before the argument is validated.
+        active_test_expectation_counter.actual += 1;
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        const constructor_value = matcher_args[0];
+        if (!constructor_value.isConstructor()) {
+            globalObject.throw("Expected value must be a function: {any}", .{constructor_value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+        constructor_value.ensureStillAlive();
+
+        const received_value: JSValue = this.getValue(globalObject, this_value, "toBeInstanceOf", "<green>expected<r>") orelse return .zero;
+
+        const is_negated = this.flags.not;
+        const is_instance = received_value.isInstanceOf(globalObject, constructor_value);
+        if (is_instance != is_negated) return this_value;
+
+        // handle failure
+        const constructor_fmt = constructor_value.toFmt(globalObject, &formatter);
+        const received_fmt = received_value.toFmt(globalObject, &formatter);
+
+        if (is_negated) {
+            const expected_line = "Expected constructor: not <green>{any}<r>\n";
+            const received_line = "Received value: <red>{any}<r>\n";
+            const failure_fmt = comptime getSignature("toBeInstanceOf", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            // The negated path selects the colored or plain format explicitly.
+            if (Output.enable_ansi_colors) {
+                globalObject.throw(Output.prettyFmt(failure_fmt, true), .{ constructor_fmt, received_fmt });
+            } else {
+                globalObject.throw(Output.prettyFmt(failure_fmt, false), .{ constructor_fmt, received_fmt });
+            }
+        } else {
+            const expected_line = "Expected constructor: <green>{any}<r>\n";
+            const received_line = "Received value: <red>{any}<r>\n";
+            const failure_fmt = comptime getSignature("toBeInstanceOf", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+            globalObject.throwPretty(failure_fmt, .{ constructor_fmt, received_fmt });
+        }
+        return .zero;
+    }
+
+    /// Matcher for `expect(received).toMatch(expected)`.
+    /// Passes when the received string includes the expected string, or when the expected
+    /// regular expression matches it; the outcome is inverted when `.not` is set.
+    /// Returns `thisValue` on success and `.zero` on every failure path.
+    pub fn toMatch(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        JSC.markBinding(@src());
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const frame_args = callFrame.arguments(1);
+        const arguments: []const JSValue = frame_args.ptr[0..frame_args.len];
+        if (arguments.len < 1) {
+            globalObject.throwInvalidArguments("toMatch() requires 1 argument", .{});
+            return .zero;
+        }
+
+        // The expectation is counted before the operand type checks below.
+        active_test_expectation_counter.actual += 1;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        const expected_value = arguments[0];
+        if (!(expected_value.isString() or expected_value.isRegExp())) {
+            globalObject.throw("Expected value must be a string or regular expression: {any}", .{expected_value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+        expected_value.ensureStillAlive();
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toMatch", "<green>expected<r>") orelse return .zero;
+        if (!value.isString()) {
+            globalObject.throw("Received value must be a string: {any}", .{value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        const not = this.flags.not;
+        // The type check above guarantees the expected value is a string or a RegExp.
+        const matched = if (expected_value.isString())
+            value.stringIncludes(globalObject, expected_value)
+        else if (expected_value.isRegExp())
+            expected_value.toMatch(globalObject, value)
+        else
+            unreachable;
+
+        // Success is a match without `.not`, or a non-match with `.not`.
+        if (matched != not) return thisValue;
+
+        // handle failure
+        const expected_fmt = expected_value.toFmt(globalObject, &formatter);
+        const value_fmt = value.toFmt(globalObject, &formatter);
+        const received_line = "Received: <red>{any}<r>\n";
+
+        // The two messages differ only in the `.not` wording of the signature and expected line.
+        if (not) {
+            const expected_line = "Expected substring or pattern: not <green>{any}<r>\n";
+            const fmt = comptime getSignature("toMatch", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
+            globalObject.throwPretty(fmt, .{ expected_fmt, value_fmt });
+        } else {
+            const expected_line = "Expected substring or pattern: <green>{any}<r>\n";
+            const fmt = comptime getSignature("toMatch", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
+            globalObject.throwPretty(fmt, .{ expected_fmt, value_fmt });
+        }
+        return .zero;
+    }
+
+    /// Matcher for `expect(mockFn).toHaveBeenCalled()`.
+    /// Passes when the mock's recorded call list is non-empty (inverted under `.not`).
+    /// Throws if the received value is not a mock function.
+    /// Returns `thisValue` on success and `.zero` on every failure path.
+    pub fn toHaveBeenCalled(this: *Expect, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        JSC.markBinding(@src());
+        const thisValue = callframe.this();
+        defer this.postMatch(globalObject);
+
+        const value: JSValue = this.getValue(globalObject, thisValue, "toHaveBeenCalled", "") orelse return .zero;
+
+        const calls = JSMockFunction__getCalls(value);
+        active_test_expectation_counter.actual += 1;
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        // `.zero` means the received value is not a JSMockFunction.
+        if (calls == .zero or !calls.jsType().isArray()) {
+            globalObject.throw("Expected value must be a mock function: {any}", .{value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        // A mock that was never called reports an empty array, so a non-zero length means "called".
+        var pass = calls.getLength(globalObject) > 0;
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        const calls_fmt = calls.toFmt(globalObject, &formatter);
+
+        if (not) {
+            const signature = comptime getSignature("toHaveBeenCalled", "", true);
+            const fmt = signature ++ "\n\nExpected: not <green>{any}<r>\n";
+            globalObject.throwPretty(fmt, .{calls_fmt});
+            return .zero;
+        }
+
+        // The non-negated failure must render the signature without `.not`.
+        const signature = comptime getSignature("toHaveBeenCalled", "", false);
+        const fmt = signature ++ "\n\nExpected <green>{any}<r>\n";
+        globalObject.throwPretty(fmt, .{calls_fmt});
+        return .zero;
+    }
+    /// Matcher for `expect(mockFn).toHaveBeenCalledTimes(n)`.
+    /// Passes when the mock's recorded call count equals the integer argument `n`
+    /// (inverted under `.not`). Throws if the received value is not a mock function or if
+    /// `n` is missing or not an integer. Returns `thisValue` on success, `.zero` otherwise.
+    pub fn toHaveBeenCalledTimes(this: *Expect, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        JSC.markBinding(@src());
+
+        const thisValue = callframe.this();
+        const arguments_ = callframe.arguments(1);
+        const arguments: []const JSValue = arguments_.ptr[0..arguments_.len];
+        defer this.postMatch(globalObject);
+        const value: JSValue = this.getValue(globalObject, thisValue, "toHaveBeenCalledTimes", "<green>expected<r>") orelse return .zero;
+
+        active_test_expectation_counter.actual += 1;
+
+        const calls = JSMockFunction__getCalls(value);
+
+        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
+
+        // `.zero` means the received value is not a JSMockFunction.
+        if (calls == .zero or !calls.jsType().isArray()) {
+            globalObject.throw("Expected value must be a mock function: {any}", .{value.toFmt(globalObject, &formatter)});
+            return .zero;
+        }
+
+        if (arguments.len < 1 or !arguments[0].isAnyInt()) {
+            globalObject.throwInvalidArguments("toHaveBeenCalledTimes() requires 1 integer argument", .{});
+            return .zero;
+        }
+
+        const times = arguments[0].coerce(i32, globalObject);
+
+        var pass = @intCast(i32, calls.getLength(globalObject)) == times;
+
+        const not = this.flags.not;
+        if (not) pass = !pass;
+        if (pass) return thisValue;
+
+        // handle failure
+        // NOTE(review): the message prints the recorded calls rather than the expected count — confirm intended.
+        const calls_fmt = calls.toFmt(globalObject, &formatter);
+
+        if (not) {
+            const signature = comptime getSignature("toHaveBeenCalledTimes", "<green>expected<r>", true);
+            const fmt = signature ++ "\n\nExpected: not <green>{any}<r>\n";
+            globalObject.throwPretty(fmt, .{calls_fmt});
+            return .zero;
+        }
+
+        // The non-negated failure must render the signature without `.not`.
+        const signature = comptime getSignature("toHaveBeenCalledTimes", "<green>expected<r>", false);
+        const fmt = signature ++ "\n\nExpected <green>{any}<r>\n";
+        globalObject.throwPretty(fmt, .{calls_fmt});
+        return .zero;
+    }
+
+    /// Matcher for `expect(received).toMatchObject(expected)`.
+    /// Both operands must be objects; a non-object operand throws a "Matcher error".
+    /// Passes when `received.jestDeepMatch(expected, ...)` reports a match (inverted under `.not`).
+    /// Returns `thisValue` on success and `.zero` on every failure path.
+    pub fn toMatchObject(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        JSC.markBinding(@src());
+        defer this.postMatch(globalObject);
+
+        const thisValue = callFrame.this();
+        const args = callFrame.arguments(1).slice();
+
+        active_test_expectation_counter.actual += 1;
+
+        const not = this.flags.not;
+
+        const received_object: JSValue = this.getValue(globalObject, thisValue, "toMatchObject", "<green>expected<r>") orelse return .zero;
+
+        // The received value is validated before the expected argument.
+        if (!received_object.isObject()) {
+            const matcher_error = "\n\n<b>Matcher error<r>: <red>received<r> value must be a non-null object\n";
+            if (not) {
+                const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", true) ++ matcher_error;
+                globalObject.throwPretty(fmt, .{});
+            } else {
+                const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", false) ++ matcher_error;
+                globalObject.throwPretty(fmt, .{});
+            }
+            return .zero;
+        }
+
+        if (args.len < 1 or !args[0].isObject()) {
+            const matcher_error = "\n\n<b>Matcher error<r>: <green>expected<r> value must be a non-null object\n";
+            if (not) {
+                const fmt = comptime getSignature("toMatchObject", "", true) ++ matcher_error;
+                globalObject.throwPretty(fmt, .{});
+            } else {
+                const fmt = comptime getSignature("toMatchObject", "", false) ++ matcher_error;
+                globalObject.throwPretty(fmt, .{});
+            }
+            return .zero;
+        }
+
+        const expected_object = args[0];
+        const matched = received_object.jestDeepMatch(expected_object, globalObject, true);
+
+        // Success is a match without `.not`, or a non-match with `.not`.
+        if (matched != not) return thisValue;
+
+        // handle failure
+        const diff_formatter = DiffFormatter{
+            .received = received_object,
+            .expected = expected_object,
+            .globalObject = globalObject,
+            .not = not,
+        };
+
+        if (not) {
+            const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", true) ++ "\n\n{any}\n";
+            globalObject.throwPretty(fmt, .{diff_formatter});
+        } else {
+            const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", false) ++ "\n\n{any}\n";
+            globalObject.throwPretty(fmt, .{diff_formatter});
+        }
+        return .zero;
+    }
+
+    pub const toHaveBeenCalledWith = notImplementedJSCFn;
+    pub const toHaveBeenLastCalledWith = notImplementedJSCFn;
+    pub const toHaveBeenNthCalledWith = notImplementedJSCFn;
+    pub const toHaveReturnedTimes = notImplementedJSCFn;
+    pub const toHaveReturnedWith = notImplementedJSCFn;
+    pub const toHaveLastReturnedWith = notImplementedJSCFn;
+    pub const toHaveNthReturnedWith = notImplementedJSCFn;
+    pub const toContainEqual = notImplementedJSCFn;
+    pub const toMatchInlineSnapshot = notImplementedJSCFn;
+    pub const toThrowErrorMatchingSnapshot = notImplementedJSCFn;
+    pub const toThrowErrorMatchingInlineSnapshot = notImplementedJSCFn;
+
+    pub const getStaticNot = notImplementedStaticProp;
+    pub const getStaticResolves = notImplementedStaticProp;
+    pub const getStaticRejects = notImplementedStaticProp;
+
+    pub fn any(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        return ExpectAny.call(globalObject, callFrame); // thin forwarder: all work happens in ExpectAny.call
+    }
+
+    pub fn anything(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        return ExpectAnything.call(globalObject, callFrame); // thin forwarder: all work happens in ExpectAnything.call
+    }
+
+    pub fn stringContaining(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        return ExpectStringContaining.call(globalObject, callFrame); // thin forwarder: all work happens in ExpectStringContaining.call
+    }
+
+    pub fn stringMatching(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        return ExpectStringMatching.call(globalObject, callFrame); // thin forwarder: all work happens in ExpectStringMatching.call
+    }
+
+    pub const extend = notImplementedStaticFn;
+    pub const arrayContaining = notImplementedStaticFn;
+    pub const assertions = notImplementedStaticFn;
+    pub const hasAssertions = notImplementedStaticFn;
+    pub const objectContaining = notImplementedStaticFn;
+    pub const addSnapshotSerializer = notImplementedStaticFn;
+
+    pub fn notImplementedJSCFn(_: *Expect, globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        globalObject.throw("Not implemented", .{}); // placeholder: instance matchers aliased to this function always throw
+        return .zero; // `.zero` is returned after throwing, as the implemented matchers do
+    }
+
+    pub fn notImplementedStaticFn(globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        globalObject.throw("Not implemented", .{}); // placeholder: static functions aliased to this (e.g. `extend`) always throw
+        return .zero; // `.zero` is returned after throwing, as the implemented matchers do
+    }
+
+    pub fn notImplementedJSCProp(_: *Expect, _: JSC.JSValue, globalObject: *JSC.JSGlobalObject) callconv(.C) JSC.JSValue {
+        globalObject.throw("Not implemented", .{}); // placeholder that always throws; no alias to it is visible in this part of the file
+        return .zero; // `.zero` is returned after throwing, as the implemented matchers do
+    }
+
+    pub fn notImplementedStaticProp(globalObject: *JSC.JSGlobalObject, _: JSC.JSValue, _: JSC.JSValue) callconv(.C) JSC.JSValue {
+        globalObject.throw("Not implemented", .{}); // placeholder: static props aliased to this (e.g. `getStaticNot`) always throw
+        return .zero; // `.zero` is returned after throwing, as the implemented matchers do
+    }
+
+    /// Matchers register this with `defer`; it calls `autoGarbageCollect` on the global object's VM.
+    pub fn postMatch(_: *Expect, globalObject: *JSC.JSGlobalObject) void {
+        globalObject.bunVM().autoGarbageCollect();
+    }
+};
+
+/// Backing type for the value returned by `expect.anything()`.
+pub const ExpectAnything = struct {
+    pub usingnamespace JSC.Codegen.JSExpectAnything;
+
+    /// Destroys the instance through the VM allocator.
+    pub fn finalize(this: *ExpectAnything) callconv(.C) void {
+        VirtualMachine.get().allocator.destroy(this);
+    }
+
+    /// Returns a new instance wrapped in a JS value; throws `TestNotRunningError` outside a test.
+    pub fn call(globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSValue {
+        // Check before allocating so the error path does not leak the instance.
+        if (Jest.runner.?.pending_test == null) {
+            const err = globalObject.createErrorInstance("expect.anything() must be called in a test", .{});
+            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
+            globalObject.throwValue(err);
+            return .zero;
+        }
+
+        const anything = globalObject.bunVM().allocator.create(ExpectAnything) catch unreachable;
+        const anything_js_value = anything.toJS(globalObject);
+        anything_js_value.ensureStillAlive();
+
+        globalObject.bunVM().autoGarbageCollect();
+        return anything_js_value;
+    }
+};
+
+/// Backing type for the value returned by `expect.stringMatching(string or RegExp)`.
+pub const ExpectStringMatching = struct {
+    pub usingnamespace JSC.Codegen.JSExpectStringMatching;
+
+    /// Destroys the instance through the VM allocator.
+    pub fn finalize(this: *ExpectStringMatching) callconv(.C) void {
+        VirtualMachine.get().allocator.destroy(this);
+    }
+
+    /// Validates the argument, then returns a new JS wrapper with the argument cached on it.
+    pub fn call(globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        const args = callFrame.arguments(1).slice();
+
+        if (args.len == 0 or (!args[0].isString() and !args[0].isRegExp())) {
+            const fmt = "<d>expect.<r>stringMatching<d>(<r>string<d>)<r>\n\nExpected a string or regular expression\n";
+            globalObject.throwPretty(fmt, .{});
+            return .zero;
+        }
+
+        // Check before allocating so the error path does not leak the instance.
+        if (Jest.runner.?.pending_test == null) {
+            const err = globalObject.createErrorInstance("expect.stringMatching() must be called in a test", .{});
+            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
+            globalObject.throwValue(err);
+            return .zero;
+        }
+
+        const string_matching = globalObject.bunVM().allocator.create(ExpectStringMatching) catch unreachable;
+        const string_matching_js_value = string_matching.toJS(globalObject);
+        ExpectStringMatching.testValueSetCached(string_matching_js_value, globalObject, args[0]);
+
+        var vm = globalObject.bunVM();
+        vm.autoGarbageCollect();
+        return string_matching_js_value;
+    }
+};
+
+/// Backing type for the value returned by `expect.stringContaining(string)`.
+pub const ExpectStringContaining = struct {
+    pub usingnamespace JSC.Codegen.JSExpectStringContaining;
+
+    /// Destroys the instance through the VM allocator.
+    pub fn finalize(this: *ExpectStringContaining) callconv(.C) void {
+        VirtualMachine.get().allocator.destroy(this);
+    }
+
+    /// Validates the argument, then returns a new JS wrapper with the string cached on it.
+    pub fn call(globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
+        const args = callFrame.arguments(1).slice();
+
+        if (args.len == 0 or !args[0].isString()) {
+            const fmt = "<d>expect.<r>stringContaining<d>(<r>string<d>)<r>\n\nExpected a string\n";
+            globalObject.throwPretty(fmt, .{});
+            return .zero;
+        }
+
+        // Check before allocating so the error path does not leak the instance.
+        if (Jest.runner.?.pending_test == null) {
+            const err = globalObject.createErrorInstance("expect.stringContaining() must be called in a test", .{});
+            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
+            globalObject.throwValue(err);
+            return .zero;
+        }
+
+        const string_value = args[0];
+        const string_containing = globalObject.bunVM().allocator.create(ExpectStringContaining) catch unreachable;
+        const string_containing_js_value = string_containing.toJS(globalObject);
+        ExpectStringContaining.stringValueSetCached(string_containing_js_value, globalObject, string_value);
+
+        var vm = globalObject.bunVM();
+        vm.autoGarbageCollect();
+        return string_containing_js_value;
+    }
+};
+
+/// Backing type for the value returned by `expect.any(constructor)`.
+pub const ExpectAny = struct {
+    pub usingnamespace JSC.Codegen.JSExpectAny;
+
+    /// Destroys the instance through the VM allocator.
+    pub fn finalize(this: *ExpectAny) callconv(.C) void {
+        VirtualMachine.get().allocator.destroy(this);
+    }
+
+    /// Validates the constructor argument, then returns a new JS wrapper with it cached.
+    pub fn call(globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
+        const _arguments = callFrame.arguments(1);
+        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
+
+        if (arguments.len == 0) {
+            globalObject.throw("any() expects to be passed a constructor function.", .{});
+            return .zero;
+        }
+
+        const constructor = arguments[0];
+        constructor.ensureStillAlive();
+        if (!constructor.isConstructor()) {
+            const fmt = "<d>expect.<r>any<d>(<r>constructor<d>)<r>\n\nExpected a constructor\n";
+            globalObject.throwPretty(fmt, .{});
+            return .zero;
+        }
+
+        // Check before allocating so the error path does not leak the instance.
+        if (Jest.runner.?.pending_test == null) {
+            const err = globalObject.createErrorInstance("expect.any() must be called in a test", .{});
+            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
+            globalObject.throwValue(err);
+            return .zero;
+        }
+
+        const any = globalObject.bunVM().allocator.create(ExpectAny) catch unreachable;
+        any.* = .{};
+        const any_js_value = any.toJS(globalObject);
+        any_js_value.ensureStillAlive();
+        ExpectAny.constructorValueSetCached(any_js_value, globalObject, constructor);
+        any_js_value.ensureStillAlive();
+
+        var vm = globalObject.bunVM();
+        vm.autoGarbageCollect();
+        return any_js_value;
+    }
+};
+
+/// JSValue.zero is used to indicate it was not a JSMockFunction
+/// If there were no calls, it returns an empty JSArray*
+extern fn JSMockFunction__getCalls(JSValue) JSValue;
+
+/// JSValue.zero is used to indicate it was not a JSMockFunction
+/// If there were no calls, it returns an empty JSArray*
+extern fn JSMockFunction__getReturns(JSValue) JSValue;
diff --git a/src/bun.js/test/jest.zig b/src/bun.js/test/jest.zig
index 00cc954ad..55600ded8 100644
--- a/src/bun.js/test/jest.zig
+++ b/src/bun.js/test/jest.zig
@@ -8,7 +8,12 @@ const MimeType = @import("../../http.zig").MimeType;
 const ZigURL = @import("../../url.zig").URL;
 const HTTPClient = @import("root").bun.HTTP;
 const NetworkThread = HTTPClient.NetworkThread;
-const Environment = @import("../../env.zig");
+const Environment = bun.Environment;
+
+const Snapshots = @import("./snapshot.zig").Snapshots;
+const expect = @import("./expect.zig");
+const Counter = expect.Counter;
+const Expect = expect.Expect;
 
 const DiffFormatter = @import("./diff_format.zig").DiffFormatter;
 
@@ -28,8 +33,6 @@ const default_allocator = @import("root").bun.default_allocator;
 const FeatureFlags = @import("root").bun.FeatureFlags;
 const ArrayBuffer = @import("../base.zig").ArrayBuffer;
 const Properties = @import("../base.zig").Properties;
-const d = @import("../base.zig").d;
-const castObj = @import("../base.zig").castObj;
 const getAllocator = @import("../base.zig").getAllocator;
 
 const ZigString = JSC.ZigString;
@@ -43,12 +46,10 @@ const JSObject = JSC.JSObject;
 const CallFrame = JSC.CallFrame;
 
 const VirtualMachine = JSC.VirtualMachine;
-const Task = @import("../javascript.zig").Task;
-
-const Fs = @import("../../fs.zig");
+const Fs = bun.fs;
 const is_bindgen: bool = std.meta.globalOption("bindgen", bool) orelse false;
 
-const ArrayIdentityContext = @import("../../identity_context.zig").ArrayIdentityContext;
+const ArrayIdentityContext = bun.ArrayIdentityContext;
 pub var test_elapsed_timer: ?*std.time.Timer = null;
 
 pub const Tag = enum(u3) {
@@ -210,13 +211,13 @@ pub const TestRunner = struct {
         const start = @truncate(Test.ID, this.tests.len);
         this.tests.len += count;
         var statuses = this.tests.items(.status)[start..][0..count];
-        std.mem.set(Test.Status, statuses, Test.Status.pending);
+        @memset(statuses, Test.Status.pending);
         this.callback.onUpdateCount(this.callback, count, count + start);
         return start;
     }
 
     pub fn getOrPutFile(this: *TestRunner, file_path: string) *DescribeScope {
-        var entry = this.index.getOrPut(this.allocator, @truncate(u32, std.hash.Wyhash.hash(0, file_path))) catch unreachable;
+        var entry = this.index.getOrPut(this.allocator, @truncate(u32, bun.hash(file_path))) catch unreachable;
         if (entry.found_existing) {
             return this.files.items(.module_scope)[entry.value_ptr.*];
         }
@@ -258,274 +259,6 @@ pub const TestRunner = struct {
     };
 };
 
-pub const Snapshots = struct {
-    const file_header = "// Bun Snapshot v1, https://goo.gl/fbAQLP\n";
-    pub const ValuesHashMap = std.HashMap(usize, string, bun.IdentityContext(usize), std.hash_map.default_max_load_percentage);
-
-    allocator: std.mem.Allocator,
-    update_snapshots: bool,
-    total: usize = 0,
-    added: usize = 0,
-    passed: usize = 0,
-    failed: usize = 0,
-
-    file_buf: *std.ArrayList(u8),
-    values: *ValuesHashMap,
-    counts: *bun.StringHashMap(usize),
-    _current_file: ?File = null,
-    snapshot_dir_path: ?string = null,
-
-    const File = struct {
-        id: TestRunner.File.ID,
-        file: std.fs.File,
-    };
-
-    pub fn getOrPut(this: *Snapshots, expect: *Expect, value: JSValue, hint: string, globalObject: *JSC.JSGlobalObject) !?string {
-        switch (try this.getSnapshotFile(expect.scope.file_id)) {
-            .result => {},
-            .err => |err| {
-                return switch (err.syscall) {
-                    .mkdir => error.FailedToMakeSnapshotDirectory,
-                    .open => error.FailedToOpenSnapshotFile,
-                    else => error.SnapshotFailed,
-                };
-            },
-        }
-
-        const snapshot_name = try expect.getSnapshotName(this.allocator, hint);
-        this.total += 1;
-
-        var count_entry = try this.counts.getOrPut(snapshot_name);
-        const counter = brk: {
-            if (count_entry.found_existing) {
-                this.allocator.free(snapshot_name);
-                count_entry.value_ptr.* += 1;
-                break :brk count_entry.value_ptr.*;
-            }
-            count_entry.value_ptr.* = 1;
-            break :brk count_entry.value_ptr.*;
-        };
-
-        const name = count_entry.key_ptr.*;
-
-        var counter_string_buf = [_]u8{0} ** 32;
-        var counter_string = try std.fmt.bufPrint(&counter_string_buf, "{d}", .{counter});
-
-        var name_with_counter = try this.allocator.alloc(u8, name.len + 1 + counter_string.len);
-        defer this.allocator.free(name_with_counter);
-        bun.copy(u8, name_with_counter[0..name.len], name);
-        name_with_counter[name.len] = ' ';
-        bun.copy(u8, name_with_counter[name.len + 1 ..], counter_string);
-
-        const name_hash = std.hash.Wyhash.hash(0, name_with_counter);
-        if (this.values.get(name_hash)) |expected| {
-            return expected;
-        }
-
-        // doesn't exist. append to file bytes and add to hashmap.
-        var pretty_value = try MutableString.init(this.allocator, 0);
-        try value.jestSnapshotPrettyFormat(&pretty_value, globalObject);
-
-        const serialized_length = "\nexports[`".len + name_with_counter.len + "`] = `".len + pretty_value.list.items.len + "`;\n".len;
-        try this.file_buf.ensureUnusedCapacity(serialized_length);
-        this.file_buf.appendSliceAssumeCapacity("\nexports[`");
-        this.file_buf.appendSliceAssumeCapacity(name_with_counter);
-        this.file_buf.appendSliceAssumeCapacity("`] = `");
-        this.file_buf.appendSliceAssumeCapacity(pretty_value.list.items);
-        this.file_buf.appendSliceAssumeCapacity("`;\n");
-
-        this.added += 1;
-        try this.values.put(name_hash, pretty_value.toOwnedSlice());
-        return null;
-    }
-
-    pub fn parseFile(this: *Snapshots) !void {
-        if (this.file_buf.items.len == 0) return;
-
-        const vm = VirtualMachine.get();
-        var opts = js_parser.Parser.Options.init(vm.bundler.options.jsx, .js);
-        var temp_log = logger.Log.init(this.allocator);
-
-        const test_file = Jest.runner.?.files.get(this._current_file.?.id);
-        const test_filename = test_file.source.path.name.filename;
-        const dir_path = test_file.source.path.name.dirWithTrailingSlash();
-
-        var snapshot_file_path_buf: [bun.MAX_PATH_BYTES]u8 = undefined;
-        var remain: []u8 = snapshot_file_path_buf[0..bun.MAX_PATH_BYTES];
-        bun.copy(u8, remain, dir_path);
-        remain = remain[dir_path.len..];
-        bun.copy(u8, remain, "__snapshots__/");
-        remain = remain["__snapshots__/".len..];
-        bun.copy(u8, remain, test_filename);
-        remain = remain[test_filename.len..];
-        bun.copy(u8, remain, ".snap");
-        remain = remain[".snap".len..];
-        remain[0] = 0;
-        const snapshot_file_path = snapshot_file_path_buf[0 .. snapshot_file_path_buf.len - remain.len :0];
-
-        const source = logger.Source.initPathString(snapshot_file_path, this.file_buf.items);
-
-        var parser = try js_parser.Parser.init(
-            opts,
-            &temp_log,
-            &source,
-            vm.bundler.options.define,
-            this.allocator,
-        );
-
-        var parse_result = try parser.parse();
-        var ast = if (parse_result == .ast) parse_result.ast else return error.ParseError;
-        defer ast.deinit();
-
-        if (ast.exports_ref.isNull()) return;
-        const exports_ref = ast.exports_ref;
-
-        // TODO: when common js transform changes, keep this updated or add flag to support this version
-
-        const export_default = brk: {
-            for (ast.parts.slice()) |part| {
-                for (part.stmts) |stmt| {
-                    if (stmt.data == .s_export_default and stmt.data.s_export_default.value == .expr) {
-                        break :brk stmt.data.s_export_default.value.expr;
-                    }
-                }
-            }
-
-            return;
-        };
-
-        if (export_default.data == .e_call) {
-            const function_call = export_default.data.e_call;
-            if (function_call.args.len == 2 and function_call.args.ptr[0].data == .e_function) {
-                const arg_function_stmts = function_call.args.ptr[0].data.e_function.func.body.stmts;
-                for (arg_function_stmts) |stmt| {
-                    switch (stmt.data) {
-                        .s_expr => |expr| {
-                            if (expr.value.data == .e_binary and expr.value.data.e_binary.op == .bin_assign) {
-                                const left = expr.value.data.e_binary.left;
-                                if (left.data == .e_index and left.data.e_index.index.data == .e_string and left.data.e_index.target.data == .e_identifier) {
-                                    const target: js_ast.E.Identifier = left.data.e_index.target.data.e_identifier;
-                                    var index: *js_ast.E.String = left.data.e_index.index.data.e_string;
-                                    if (target.ref.eql(exports_ref) and expr.value.data.e_binary.right.data == .e_string) {
-                                        const key = index.slice(this.allocator);
-                                        var value_string = expr.value.data.e_binary.right.data.e_string;
-                                        const value = value_string.slice(this.allocator);
-                                        defer {
-                                            if (!index.isUTF8()) this.allocator.free(key);
-                                            if (!value_string.isUTF8()) this.allocator.free(value);
-                                        }
-                                        const value_clone = try this.allocator.alloc(u8, value.len);
-                                        bun.copy(u8, value_clone, value);
-                                        const name_hash = std.hash.Wyhash.hash(0, key);
-                                        try this.values.put(name_hash, value_clone);
-                                    }
-                                }
-                            }
-                        },
-                        else => {},
-                    }
-                }
-            }
-        }
-    }
-
-    pub fn writeSnapshotFile(this: *Snapshots) !void {
-        if (this._current_file) |_file| {
-            var file = _file;
-            file.file.writeAll(this.file_buf.items) catch {
-                return error.FailedToWriteSnapshotFile;
-            };
-            file.file.close();
-            this.file_buf.clearAndFree();
-
-            var value_itr = this.values.valueIterator();
-            while (value_itr.next()) |value| {
-                this.allocator.free(value.*);
-            }
-            this.values.clearAndFree();
-
-            var count_key_itr = this.counts.keyIterator();
-            while (count_key_itr.next()) |key| {
-                this.allocator.free(key.*);
-            }
-            this.counts.clearAndFree();
-        }
-    }
-
-    fn getSnapshotFile(this: *Snapshots, file_id: TestRunner.File.ID) !JSC.Maybe(void) {
-        if (this._current_file == null or this._current_file.?.id != file_id) {
-            try this.writeSnapshotFile();
-
-            const test_file = Jest.runner.?.files.get(file_id);
-            const test_filename = test_file.source.path.name.filename;
-            const dir_path = test_file.source.path.name.dirWithTrailingSlash();
-
-            var snapshot_file_path_buf: [bun.MAX_PATH_BYTES]u8 = undefined;
-            var remain: []u8 = snapshot_file_path_buf[0..bun.MAX_PATH_BYTES];
-            bun.copy(u8, remain, dir_path);
-            remain = remain[dir_path.len..];
-            bun.copy(u8, remain, "__snapshots__/");
-            remain = remain["__snapshots__/".len..];
-
-            if (this.snapshot_dir_path == null or !strings.eqlLong(dir_path, this.snapshot_dir_path.?, true)) {
-                remain[0] = 0;
-                const snapshot_dir_path = snapshot_file_path_buf[0 .. snapshot_file_path_buf.len - remain.len :0];
-                switch (JSC.Node.Syscall.mkdir(snapshot_dir_path, 0o777)) {
-                    .result => this.snapshot_dir_path = dir_path,
-                    .err => |err| {
-                        switch (err.getErrno()) {
-                            std.os.E.EXIST => this.snapshot_dir_path = dir_path,
-                            else => return JSC.Maybe(void){
-                                .err = err,
-                            },
-                        }
-                    },
-                }
-            }
-
-            bun.copy(u8, remain, test_filename);
-            remain = remain[test_filename.len..];
-            bun.copy(u8, remain, ".snap");
-            remain = remain[".snap".len..];
-            remain[0] = 0;
-            const snapshot_file_path = snapshot_file_path_buf[0 .. snapshot_file_path_buf.len - remain.len :0];
-
-            var flags: JSC.Node.Mode = std.os.O.CREAT | std.os.O.RDWR;
-            if (this.update_snapshots) flags |= std.os.O.TRUNC;
-            const fd = switch (JSC.Node.Syscall.open(snapshot_file_path, flags, 0o644)) {
-                .result => |_fd| _fd,
-                .err => |err| return JSC.Maybe(void){
-                    .err = err,
-                },
-            };
-
-            var file: File = .{
-                .id = file_id,
-                .file = .{ .handle = fd },
-            };
-
-            if (this.update_snapshots) {
-                try this.file_buf.appendSlice(file_header);
-            } else {
-                const length = try file.file.getEndPos();
-                if (length == 0) {
-                    try this.file_buf.appendSlice(file_header);
-                } else {
-                    const buf = try this.allocator.alloc(u8, length);
-                    _ = try file.file.preadAll(buf, 0);
-                    try this.file_buf.appendSlice(buf);
-                    this.allocator.free(buf);
-                }
-            }
-
-            this._current_file = file;
-            try this.parseFile();
-        }
-
-        return JSC.Maybe(void).success;
-    }
-};
-
 pub const Jest = struct {
     pub var runner: ?*TestRunner = null;
 
@@ -600,7 +333,7 @@ pub const Jest = struct {
     pub fn Bun__Jest__createTestModuleObject(globalObject: *JSC.JSGlobalObject) callconv(.C) JSC.JSValue {
         JSC.markBinding(@src());
 
-        const module = JSC.JSValue.createEmptyObject(globalObject, 11);
+        const module = JSC.JSValue.createEmptyObject(globalObject, 13);
 
         const test_fn = JSC.NewFunction(globalObject, ZigString.static("test"), 2, TestScope.call, false);
         module.put(
@@ -698,31 +431,62 @@ pub const Jest = struct {
             Expect.getConstructor(globalObject),
         );
 
-        const mock_fn = JSMockFunction__createObject(globalObject);
-        const spyOn = JSC.NewFunction(globalObject, ZigString.static("spyOn"), 2, JSMock__spyOn, false);
-        const restoreAllMocks = JSC.NewFunction(globalObject, ZigString.static("restoreAllMocks"), 2, jsFunctionResetSpies, false);
-        module.put(globalObject, ZigString.static("mock"), mock_fn);
+        const setSystemTime = JSC.NewFunction(globalObject, ZigString.static("setSystemTime"), 0, JSMock__jsSetSystemTime, false);
+        module.put(
+            globalObject,
+            ZigString.static("setSystemTime"),
+            setSystemTime,
+        );
+        const useFakeTimers = JSC.NewFunction(globalObject, ZigString.static("useFakeTimers"), 0, JSMock__jsUseFakeTimers, false);
+        const useRealTimers = JSC.NewFunction(globalObject, ZigString.static("useRealTimers"), 0, JSMock__jsUseRealTimers, false);
+
+        const mockFn = JSC.NewFunction(globalObject, ZigString.static("fn"), 1, JSMock__jsMockFn, false);
+        const spyOn = JSC.NewFunction(globalObject, ZigString.static("spyOn"), 2, JSMock__jsSpyOn, false);
+        const restoreAllMocks = JSC.NewFunction(globalObject, ZigString.static("restoreAllMocks"), 2, JSMock__jsRestoreAllMocks, false);
+        module.put(globalObject, ZigString.static("mock"), mockFn);
 
-        const jest = JSValue.createEmptyObject(globalObject, 3);
-        jest.put(globalObject, ZigString.static("fn"), mock_fn);
+        const jest = JSValue.createEmptyObject(globalObject, 7);
+        jest.put(globalObject, ZigString.static("fn"), mockFn);
         jest.put(globalObject, ZigString.static("spyOn"), spyOn);
         jest.put(globalObject, ZigString.static("restoreAllMocks"), restoreAllMocks);
+        jest.put(
+            globalObject,
+            ZigString.static("setSystemTime"),
+            setSystemTime,
+        );
+        jest.put(
+            globalObject,
+            ZigString.static("useFakeTimers"),
+            useFakeTimers,
+        );
+        jest.put(
+            globalObject,
+            ZigString.static("useRealTimers"),
+            useRealTimers,
+        );
+        jest.put(globalObject, ZigString.static("now"), JSC.NewFunction(globalObject, ZigString.static("now"), 0, JSMock__jsNow, false));
+
         module.put(globalObject, ZigString.static("jest"), jest);
         module.put(globalObject, ZigString.static("spyOn"), spyOn);
 
-        const vi = JSValue.createEmptyObject(globalObject, 1);
-        vi.put(globalObject, ZigString.static("fn"), mock_fn);
+        const vi = JSValue.createEmptyObject(globalObject, 3);
+        vi.put(globalObject, ZigString.static("fn"), mockFn);
+        vi.put(globalObject, ZigString.static("spyOn"), spyOn);
+        vi.put(globalObject, ZigString.static("restoreAllMocks"), restoreAllMocks);
         module.put(globalObject, ZigString.static("vi"), vi);
 
         return module;
     }
 
-    extern fn JSMockFunction__createObject(*JSC.JSGlobalObject) JSC.JSValue;
-
     extern fn Bun__Jest__testPreloadObject(*JSC.JSGlobalObject) JSC.JSValue;
     extern fn Bun__Jest__testModuleObject(*JSC.JSGlobalObject) JSC.JSValue;
-    extern fn jsFunctionResetSpies(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
-    extern fn JSMock__spyOn(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsMockFn(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsNow(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsSetSystemTime(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsRestoreAllMocks(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsSpyOn(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsUseFakeTimers(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
+    extern fn JSMock__jsUseRealTimers(*JSC.JSGlobalObject, *JSC.CallFrame) JSC.JSValue;
 
     pub fn call(
         _: void,
@@ -759,8 +523,7 @@ pub const Jest = struct {
         var filepath = Fs.FileSystem.instance.filename_store.append([]const u8, slice) catch unreachable;
 
         var scope = runner_.getOrPutFile(filepath);
-        DescribeScope.active = scope;
-        DescribeScope.module = scope;
+        scope.push();
 
         return Bun__Jest__testModuleObject(ctx).asObjectRef();
     }
@@ -773,3523 +536,6 @@ pub const Jest = struct {
     }
 };
 
-pub const ExpectAnything = struct {
-    pub usingnamespace JSC.Codegen.JSExpectAnything;
-
-    pub fn finalize(
-        this: *ExpectAnything,
-    ) callconv(.C) void {
-        VirtualMachine.get().allocator.destroy(this);
-    }
-
-    pub fn call(globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSValue {
-        const anything = globalObject.bunVM().allocator.create(ExpectAnything) catch unreachable;
-        if (Jest.runner.?.pending_test == null) {
-            const err = globalObject.createErrorInstance("expect.anything() must be called in a test", .{});
-            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
-            globalObject.throwValue(err);
-            return .zero;
-        }
-
-        const anything_js_value = anything.toJS(globalObject);
-        anything_js_value.ensureStillAlive();
-
-        var vm = globalObject.bunVM();
-        vm.autoGarbageCollect();
-
-        return anything_js_value;
-    }
-};
-
-pub const ExpectStringMatching = struct {
-    pub usingnamespace JSC.Codegen.JSExpectStringMatching;
-
-    pub fn finalize(
-        this: *ExpectStringMatching,
-    ) callconv(.C) void {
-        VirtualMachine.get().allocator.destroy(this);
-    }
-
-    pub fn call(globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        const args = callFrame.arguments(1).slice();
-
-        if (args.len == 0 or (!args[0].isString() and !args[0].isRegExp())) {
-            const fmt = "<d>expect.<r>stringContaining<d>(<r>string<d>)<r>\n\nExpected a string or regular expression\n";
-            globalObject.throwPretty(fmt, .{});
-            return .zero;
-        }
-
-        const test_value = args[0];
-        const string_matching = globalObject.bunVM().allocator.create(ExpectStringMatching) catch unreachable;
-
-        if (Jest.runner.?.pending_test == null) {
-            const err = globalObject.createErrorInstance("expect.stringContaining() must be called in a test", .{});
-            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
-            globalObject.throwValue(err);
-            return .zero;
-        }
-
-        const string_matching_js_value = string_matching.toJS(globalObject);
-        ExpectStringMatching.testValueSetCached(string_matching_js_value, globalObject, test_value);
-
-        var vm = globalObject.bunVM();
-        vm.autoGarbageCollect();
-        return string_matching_js_value;
-    }
-};
-
-pub const ExpectStringContaining = struct {
-    pub usingnamespace JSC.Codegen.JSExpectStringContaining;
-
-    pub fn finalize(
-        this: *ExpectStringContaining,
-    ) callconv(.C) void {
-        VirtualMachine.get().allocator.destroy(this);
-    }
-
-    pub fn call(globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        const args = callFrame.arguments(1).slice();
-
-        if (args.len == 0 or !args[0].isString()) {
-            const fmt = "<d>expect.<r>stringContaining<d>(<r>string<d>)<r>\n\nExpected a string\n";
-            globalObject.throwPretty(fmt, .{});
-            return .zero;
-        }
-
-        const string_value = args[0];
-
-        const string_containing = globalObject.bunVM().allocator.create(ExpectStringContaining) catch unreachable;
-
-        if (Jest.runner.?.pending_test == null) {
-            const err = globalObject.createErrorInstance("expect.stringContaining() must be called in a test", .{});
-            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
-            globalObject.throwValue(err);
-            return .zero;
-        }
-
-        const string_containing_js_value = string_containing.toJS(globalObject);
-        ExpectStringContaining.stringValueSetCached(string_containing_js_value, globalObject, string_value);
-
-        var vm = globalObject.bunVM();
-        vm.autoGarbageCollect();
-        return string_containing_js_value;
-    }
-};
-pub const ExpectAny = struct {
-    pub usingnamespace JSC.Codegen.JSExpectAny;
-
-    pub fn finalize(
-        this: *ExpectAny,
-    ) callconv(.C) void {
-        VirtualMachine.get().allocator.destroy(this);
-    }
-
-    pub fn call(globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len == 0) {
-            globalObject.throw("any() expects to be passed a constructor function.", .{});
-            return .zero;
-        }
-
-        const constructor = arguments[0];
-        constructor.ensureStillAlive();
-        if (!constructor.isConstructor()) {
-            const fmt = "<d>expect.<r>any<d>(<r>constructor<d>)<r>\n\nExpected a constructor\n";
-            globalObject.throwPretty(fmt, .{});
-            return .zero;
-        }
-
-        var any = globalObject.bunVM().allocator.create(ExpectAny) catch unreachable;
-
-        if (Jest.runner.?.pending_test == null) {
-            const err = globalObject.createErrorInstance("expect.any() must be called in a test", .{});
-            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
-            globalObject.throwValue(err);
-            return .zero;
-        }
-
-        any.* = .{};
-        const any_js_value = any.toJS(globalObject);
-        any_js_value.ensureStillAlive();
-        ExpectAny.constructorValueSetCached(any_js_value, globalObject, constructor);
-        any_js_value.ensureStillAlive();
-
-        var vm = globalObject.bunVM();
-        vm.autoGarbageCollect();
-
-        return any_js_value;
-    }
-};
-
-/// https://jestjs.io/docs/expect
-// To support async tests, we need to track the test ID
-pub const Expect = struct {
-    test_id: TestRunner.Test.ID,
-    scope: *DescribeScope,
-    op: Op.Set = Op.Set.init(.{}),
-
-    pub usingnamespace JSC.Codegen.JSExpect;
-
-    pub const Op = enum(u3) {
-        resolves,
-        rejects,
-        not,
-        pub const Set = std.EnumSet(Op);
-    };
-
-    pub fn getSnapshotName(this: *Expect, allocator: std.mem.Allocator, hint: string) ![]const u8 {
-        const test_name = this.scope.tests.items[this.test_id].label;
-
-        var length: usize = 0;
-        var curr_scope: ?*DescribeScope = this.scope;
-        while (curr_scope) |scope| {
-            if (scope.label.len > 0) {
-                length += scope.label.len + 1;
-            }
-            curr_scope = scope.parent;
-        }
-        length += test_name.len;
-        if (hint.len > 0) {
-            length += hint.len + 2;
-        }
-
-        var buf = try allocator.alloc(u8, length);
-
-        var index = buf.len;
-        if (hint.len > 0) {
-            index -= hint.len;
-            bun.copy(u8, buf[index..], hint);
-            index -= test_name.len + 2;
-            bun.copy(u8, buf[index..], test_name);
-            bun.copy(u8, buf[index + test_name.len ..], ": ");
-        } else {
-            index -= test_name.len;
-            bun.copy(u8, buf[index..], test_name);
-        }
-        // copy describe scopes in reverse order
-        curr_scope = this.scope;
-        while (curr_scope) |scope| {
-            if (scope.label.len > 0) {
-                index -= scope.label.len + 1;
-                bun.copy(u8, buf[index..], scope.label);
-                buf[index + scope.label.len] = ' ';
-            }
-            curr_scope = scope.parent;
-        }
-
-        return buf;
-    }
-
-    pub fn finalize(
-        this: *Expect,
-    ) callconv(.C) void {
-        VirtualMachine.get().allocator.destroy(this);
-    }
-
-    pub fn call(globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        const arguments = callframe.arguments(1);
-        const value = if (arguments.len < 1) JSC.JSValue.jsUndefined() else arguments.ptr[0];
-
-        var expect = globalObject.bunVM().allocator.create(Expect) catch unreachable;
-
-        if (Jest.runner.?.pending_test == null) {
-            const err = globalObject.createErrorInstance("expect() must be called in a test", .{});
-            err.put(globalObject, ZigString.static("name"), ZigString.init("TestNotRunningError").toValueGC(globalObject));
-            globalObject.throwValue(err);
-            return .zero;
-        }
-
-        expect.* = .{
-            .scope = Jest.runner.?.pending_test.?.describe,
-            .test_id = Jest.runner.?.pending_test.?.test_id,
-        };
-        const expect_js_value = expect.toJS(globalObject);
-        expect_js_value.ensureStillAlive();
-        JSC.Jest.Expect.capturedValueSetCached(expect_js_value, globalObject, value);
-        expect_js_value.ensureStillAlive();
-        expect.postMatch(globalObject);
-        return expect_js_value;
-    }
-
-    pub fn constructor(
-        globalObject: *JSC.JSGlobalObject,
-        callframe: *JSC.CallFrame,
-    ) callconv(.C) ?*Expect {
-        _ = callframe.arguments(1);
-        globalObject.throw("expect() cannot be called with new", .{});
-        return null;
-    }
-
-    /// Object.is()
-    pub fn toBe(
-        this: *Expect,
-        globalObject: *JSC.JSGlobalObject,
-        callframe: *JSC.CallFrame,
-    ) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-        const thisValue = callframe.this();
-        const arguments_ = callframe.arguments(1);
-        const arguments = arguments_.ptr[0..arguments_.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBe() takes 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBe() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-        const right = arguments[0];
-        right.ensureStillAlive();
-        const left = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        left.ensureStillAlive();
-
-        const not = this.op.contains(.not);
-        var pass = right.isSameValue(left, globalObject);
-        if (comptime Environment.allow_assert) {
-            std.debug.assert(pass == JSC.C.JSValueIsStrictEqual(globalObject, right.asObjectRef(), left.asObjectRef()));
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        if (not) {
-            const signature = comptime getSignature("toBe", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\nExpected: not <green>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{right.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{right.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toBe", "<green>expected<r>", false);
-        if (left.deepEquals(right, globalObject) or left.strictDeepEquals(right, globalObject)) {
-            const fmt = signature ++
-                "\n\n<d>If this test should pass, replace \"toBe\" with \"toEqual\" or \"toStrictEqual\"<r>" ++
-                "\n\nExpected: <green>{any}<r>\n" ++
-                "Received: serializes to the same string\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{right.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{right.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        if (right.isString() and left.isString()) {
-            const diff_format = DiffFormatter{
-                .expected = right,
-                .received = left,
-                .globalObject = globalObject,
-                .not = not,
-            };
-            const fmt = signature ++ "\n\n{any}\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{diff_format});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{diff_format});
-            return .zero;
-        }
-
-        const fmt = signature ++ "\n\nExpected: <green>{any}<r>\nReceived: <red>{any}<r>\n";
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{
-                right.toFmt(globalObject, &formatter),
-                left.toFmt(globalObject, &formatter),
-            });
-            return .zero;
-        }
-        globalObject.throw(Output.prettyFmt(fmt, false), .{
-            right.toFmt(globalObject, &formatter),
-            left.toFmt(globalObject, &formatter),
-        });
-        return .zero;
-    }
-
-    pub fn getSignature(comptime matcher_name: string, comptime args: string, comptime not: bool) string {
-        const received = "<d>expect(<r><red>received<r><d>).<r>";
-        comptime if (not) {
-            return received ++ "not<d>.<r>" ++ matcher_name ++ "<d>(<r>" ++ args ++ "<d>)<r>";
-        };
-        return received ++ matcher_name ++ "<d>(<r>" ++ args ++ "<d>)<r>";
-    }
-
-    pub fn toHaveLength(
-        this: *Expect,
-        globalObject: *JSC.JSGlobalObject,
-        callframe: *JSC.CallFrame,
-    ) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-        const thisValue = callframe.this();
-        const arguments_ = callframe.arguments(1);
-        const arguments = arguments_.ptr[0..arguments_.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toHaveLength() takes 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toHaveLength() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const expected: JSValue = arguments[0];
-        const value: JSValue = JSC.Jest.Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (!value.isObject() and !value.isString()) {
-            var fmt = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-            globalObject.throw("Received value does not have a length property: {any}", .{value.toFmt(globalObject, &fmt)});
-            return .zero;
-        }
-
-        if (!expected.isNumber()) {
-            var fmt = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-            globalObject.throw("Expected value must be a non-negative integer: {any}", .{expected.toFmt(globalObject, &fmt)});
-            return .zero;
-        }
-
-        const expected_length: f64 = expected.asNumber();
-        if (@round(expected_length) != expected_length or std.math.isInf(expected_length) or std.math.isNan(expected_length) or expected_length < 0) {
-            var fmt = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-            globalObject.throw("Expected value must be a non-negative integer: {any}", .{expected.toFmt(globalObject, &fmt)});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        const actual_length = value.getLengthIfPropertyExistsInternal(globalObject);
-
-        if (actual_length == std.math.inf(f64)) {
-            var fmt = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-            globalObject.throw("Received value does not have a length property: {any}", .{value.toFmt(globalObject, &fmt)});
-            return .zero;
-        } else if (std.math.isNan(actual_length)) {
-            globalObject.throw("Received value has non-number length property: {}", .{actual_length});
-            return .zero;
-        }
-
-        if (actual_length == expected_length) {
-            pass = true;
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        if (not) {
-            const expected_line = "Expected length: not <green>{d}<r>\n";
-            const fmt = comptime getSignature("toHaveLength", "<green>expected<r>", true) ++ "\n\n" ++ expected_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_length});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_length});
-            return .zero;
-        }
-
-        const expected_line = "Expected length: <green>{d}<r>\n";
-        const received_line = "Received length: <red>{d}<r>\n";
-        const fmt = comptime getSignature("toHaveLength", "<green>expected<r>", false) ++ "\n\n" ++
-            expected_line ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_length, actual_length });
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_length, actual_length });
-        return .zero;
-    }
-
-    pub fn toContain(
-        this: *Expect,
-        globalObject: *JSC.JSGlobalObject,
-        callFrame: *JSC.CallFrame,
-    ) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-        const thisValue = callFrame.this();
-        const arguments_ = callFrame.arguments(1);
-        const arguments = arguments_.ptr[0..arguments_.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toContain() takes 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toContain() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const expected = arguments[0];
-        expected.ensureStillAlive();
-        const value: JSValue = JSC.Jest.Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        if (value.isIterable(globalObject)) {
-            var itr = value.arrayIterator(globalObject);
-            while (itr.next()) |item| {
-                if (item.isSameValue(expected, globalObject)) {
-                    pass = true;
-                    break;
-                }
-            }
-        } else if (value.isString() and expected.isString()) {
-            const value_string = value.toString(globalObject).toSlice(globalObject, default_allocator).slice();
-            const expected_string = expected.toString(globalObject).toSlice(globalObject, default_allocator).slice();
-            if (strings.contains(value_string, expected_string)) {
-                pass = true;
-            } else if (value_string.len == 0 and expected_string.len == 0) { // edge case two empty strings are true
-                pass = true;
-            }
-        } else {
-            globalObject.throw("Received value must be an array type, or both received and expected values must be strings.", .{});
-            return .zero;
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        const expected_fmt = expected.toFmt(globalObject, &formatter);
-        if (not) {
-            const expected_line = "Expected to contain: not <green>{any}<r>\n";
-            const fmt = comptime getSignature("toContain", "<green>expected<r>", true) ++ "\n\n" ++ expected_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_fmt});
-            return .zero;
-        }
-
-        const expected_line = "Expected to contain: <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toContain", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toBeTruthy(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-        const thisValue = callFrame.this();
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeTruthy() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        const truthy = value.toBooleanSlow(globalObject);
-        if (truthy) pass = true;
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeTruthy", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeTruthy", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toBeUndefined(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-        const thisValue = callFrame.this();
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = false;
-        if (value.isUndefined()) pass = true;
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeUndefined", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeUndefined", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toBeNaN(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = false;
-        if (value.isNumber()) {
-            const number = value.asNumber();
-            if (number != number) pass = true;
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeNaN", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeNaN", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toBeNull(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = value.isNull();
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeNull", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeNull", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toBeDefined(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = !value.isUndefined();
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeDefined", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeDefined", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toBeFalsy(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeFalsy() must be called in a test", .{});
-            return .zero;
-        }
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        const truthy = value.toBooleanSlow(globalObject);
-        if (!truthy) pass = true;
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeFalsy", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeFalsy", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toEqual() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toEqual() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const expected = arguments[0];
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        const not = this.op.contains(.not);
-        var pass = value.jestDeepEquals(expected, globalObject);
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        const diff_formatter = DiffFormatter{
-            .received = value,
-            .expected = expected,
-            .globalObject = globalObject,
-            .not = not,
-        };
-
-        if (not) {
-            const signature = comptime getSignature("toEqual", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\n{any}\n";
-            globalObject.throwPretty(fmt, .{diff_formatter});
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toEqual", "<green>expected<r>", false);
-        const fmt = signature ++ "\n\n{any}\n";
-        globalObject.throwPretty(fmt, .{diff_formatter});
-        return .zero;
-    }
-
-    pub fn toStrictEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toStrictEqual() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toStrictEqual() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const expected = arguments[0];
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        const not = this.op.contains(.not);
-        var pass = value.jestStrictDeepEquals(expected, globalObject);
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        const diff_formatter = DiffFormatter{ .received = value, .expected = expected, .globalObject = globalObject, .not = not };
-
-        if (not) {
-            const signature = comptime getSignature("toStrictEqual", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\n{any}\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{diff_formatter});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{diff_formatter});
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toStrictEqual", "<green>expected<r>", false);
-        const fmt = signature ++ "\n\n{any}\n";
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{diff_formatter});
-            return .zero;
-        }
-        globalObject.throw(Output.prettyFmt(fmt, false), .{diff_formatter});
-        return .zero;
-    }
-
-    pub fn toHaveProperty(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(2);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toHaveProperty() requires at least 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toHaveProperty must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const expected_property_path = arguments[0];
-        expected_property_path.ensureStillAlive();
-        const expected_property: ?JSValue = if (arguments.len > 1) arguments[1] else null;
-        if (expected_property) |ev| ev.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (!expected_property_path.isString() and !expected_property_path.isIterable(globalObject)) {
-            globalObject.throw("Expected path must be a string or an array", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var path_string = ZigString.Empty;
-        expected_property_path.toZigString(&path_string, globalObject);
-
-        var pass = !value.isUndefinedOrNull();
-        var received_property: JSValue = .zero;
-
-        if (pass) {
-            received_property = value.getIfPropertyExistsFromPath(globalObject, expected_property_path);
-            pass = !received_property.isEmpty();
-        }
-
-        if (pass and expected_property != null) {
-            pass = received_property.jestDeepEquals(expected_property.?, globalObject);
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        if (not) {
-            if (expected_property != null) {
-                const signature = comptime getSignature("toHaveProperty", "<green>path<r><d>, <r><green>value<r>", true);
-                if (!received_property.isEmpty()) {
-                    const fmt = signature ++ "\n\nExpected path: <green>{any}<r>\n\nExpected value: not <green>{any}<r>\n";
-                    if (Output.enable_ansi_colors) {
-                        globalObject.throw(Output.prettyFmt(fmt, true), .{
-                            expected_property_path.toFmt(globalObject, &formatter),
-                            expected_property.?.toFmt(globalObject, &formatter),
-                        });
-                        return .zero;
-                    }
-                    globalObject.throw(Output.prettyFmt(fmt, true), .{
-                        expected_property_path.toFmt(globalObject, &formatter),
-                        expected_property.?.toFmt(globalObject, &formatter),
-                    });
-                    return .zero;
-                }
-            }
-
-            const signature = comptime getSignature("toHaveProperty", "<green>path<r>", true);
-            const fmt = signature ++ "\n\nExpected path: not <green>{any}<r>\n\nReceived value: <red>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{
-                    expected_property_path.toFmt(globalObject, &formatter),
-                    received_property.toFmt(globalObject, &formatter),
-                });
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{
-                expected_property_path.toFmt(globalObject, &formatter),
-                received_property.toFmt(globalObject, &formatter),
-            });
-            return .zero;
-        }
-
-        if (expected_property != null) {
-            const signature = comptime getSignature("toHaveProperty", "<green>path<r><d>, <r><green>value<r>", false);
-            if (!received_property.isEmpty()) {
-                // deep equal case
-                const fmt = signature ++ "\n\n{any}\n";
-                const diff_format = DiffFormatter{
-                    .received = received_property,
-                    .expected = expected_property.?,
-                    .globalObject = globalObject,
-                };
-
-                if (Output.enable_ansi_colors) {
-                    globalObject.throw(Output.prettyFmt(fmt, true), .{diff_format});
-                    return .zero;
-                }
-                globalObject.throw(Output.prettyFmt(fmt, false), .{diff_format});
-                return .zero;
-            }
-
-            const fmt = signature ++ "\n\nExpected path: <green>{any}<r>\n\nExpected value: <green>{any}<r>\n\n" ++
-                "Unable to find property\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{
-                    expected_property_path.toFmt(globalObject, &formatter),
-                    expected_property.?.toFmt(globalObject, &formatter),
-                });
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{
-                expected_property_path.toFmt(globalObject, &formatter),
-                expected_property.?.toFmt(globalObject, &formatter),
-            });
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toHaveProperty", "<green>path<r>", false);
-        const fmt = signature ++ "\n\nExpected path: <green>{any}<r>\n\nUnable to find property\n";
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{expected_property_path.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-        globalObject.throw(Output.prettyFmt(fmt, false), .{expected_property_path.toFmt(globalObject, &formatter)});
-        return .zero;
-    }
-
-    pub fn toBeEven(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeEven() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        if (value.isAnyInt()) {
-            const _value = value.toInt64();
-            pass = @mod(_value, 2) == 0;
-            if (_value == -0) { // negative zero is even
-                pass = true;
-            }
-        } else if (value.isBigInt() or value.isBigInt32()) {
-            const _value = value.toInt64();
-            pass = switch (_value == -0) { // negative zero is even
-                true => true,
-                else => _value & 1 == 0,
-            };
-        } else if (value.isNumber()) {
-            const _value = JSValue.asNumber(value);
-            if (@mod(_value, 1) == 0 and @mod(_value, 2) == 0) { // if the fraction is all zeros and even
-                pass = true;
-            } else {
-                pass = false;
-            }
-        } else {
-            pass = false;
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeEven", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-            return .zero;
-        }
-
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeEven", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{value_fmt});
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{value_fmt});
-        return .zero;
-    }
-
-    pub fn toBeGreaterThan(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBeGreaterThan() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeGreaterThan() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const other_value = arguments[0];
-        other_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
-            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        if (!value.isBigInt() and !other_value.isBigInt()) {
-            pass = value.asNumber() > other_value.asNumber();
-        } else if (value.isBigInt()) {
-            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
-                .greater_than => true,
-                else => pass,
-            };
-        } else {
-            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
-                .less_than => true,
-                else => pass,
-            };
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        const expected_fmt = other_value.toFmt(globalObject, &formatter);
-        if (not) {
-            const expected_line = "Expected: not \\> <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeGreaterThan", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected: \\> <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeGreaterThan", "<green>expected<r>", false) ++ "\n\n" ++
-            expected_line ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toBeGreaterThanOrEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBeGreaterThanOrEqual() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeGreaterThanOrEqual() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const other_value = arguments[0];
-        other_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
-            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        if (!value.isBigInt() and !other_value.isBigInt()) {
-            pass = value.asNumber() >= other_value.asNumber();
-        } else if (value.isBigInt()) {
-            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
-                .greater_than, .equal => true,
-                else => pass,
-            };
-        } else {
-            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
-                .less_than, .equal => true,
-                else => pass,
-            };
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        const expected_fmt = other_value.toFmt(globalObject, &formatter);
-        if (not) {
-            const expected_line = "Expected: not \\>= <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeGreaterThanOrEqual", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected: \\>= <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeGreaterThanOrEqual", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-        return .zero;
-    }
-
-    pub fn toBeLessThan(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBeLessThan() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeLessThan() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const other_value = arguments[0];
-        other_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
-            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        if (!value.isBigInt() and !other_value.isBigInt()) {
-            pass = value.asNumber() < other_value.asNumber();
-        } else if (value.isBigInt()) {
-            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
-                .less_than => true,
-                else => pass,
-            };
-        } else {
-            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
-                .greater_than => true,
-                else => pass,
-            };
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        const expected_fmt = other_value.toFmt(globalObject, &formatter);
-        if (not) {
-            const expected_line = "Expected: not \\< <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeLessThan", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected: \\< <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeLessThan", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-        return .zero;
-    }
-
-    /// Matcher for `expect(value).toBeLessThanOrEqual(expected)`.
-    ///
-    /// Requires one argument and must be called inside a test. Both the captured
-    /// value and the argument must be numbers or BigInts; anything else throws.
-    /// The outcome is inverted when the `.not` flag is set on `this.op`.
-    ///
-    /// Returns `thisValue` when the expectation holds. Otherwise an error is
-    /// thrown on `globalObject` and `.zero` is returned.
-    pub fn toBeLessThanOrEqual(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBeLessThanOrEqual() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeLessThanOrEqual() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const other_value = arguments[0];
-        other_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if ((!value.isNumber() and !value.isBigInt()) or (!other_value.isNumber() and !other_value.isBigInt())) {
-            globalObject.throw("Expected and actual values must be numbers or bigints", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass = false;
-
-        if (!value.isBigInt() and !other_value.isBigInt()) {
-            pass = value.asNumber() <= other_value.asNumber();
-        } else if (value.isBigInt()) {
-            pass = switch (value.asBigIntCompare(globalObject, other_value)) {
-                .less_than, .equal => true,
-                else => false,
-            };
-        } else {
-            // Only `other_value` is a BigInt here, so the comparison is made from
-            // its side and the accepted orderings are mirrored.
-            pass = switch (other_value.asBigIntCompare(globalObject, value)) {
-                .greater_than, .equal => true,
-                else => false,
-            };
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        const expected_fmt = other_value.toFmt(globalObject, &formatter);
-        if (not) {
-            const expected_line = "Expected: not \\<= <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeLessThanOrEqual", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected: \\<= <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeLessThanOrEqual", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(comptime Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        // Without colors the failure must still be reported; returning `.zero`
-        // without throwing would drop the assertion failure.
-        globalObject.throw(comptime Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    /// Matcher for `expect(received).toBeCloseTo(expected, precision?)`.
-    ///
-    /// `expected` and `received` must be numbers; `precision` is an optional
-    /// number that defaults to 2. The values are considered close when
-    /// `|received - expected| < 10^(-precision) / 2`, or when both are the same
-    /// signed infinity (whose difference would otherwise be NaN). The outcome is
-    /// inverted when the `.not` flag is set on `this.op`.
-    ///
-    /// Returns `thisValue` when the expectation holds. Otherwise an error is
-    /// thrown on `globalObject` and `.zero` is returned.
-    pub fn toBeCloseTo(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const thisArguments = callFrame.arguments(2);
-        const arguments = thisArguments.ptr[0..thisArguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBeCloseTo() requires at least 1 argument. Expected value must be a number", .{});
-            return .zero;
-        }
-
-        // Same bookkeeping as the sibling matchers: the call must happen inside a
-        // test and it counts towards the expectation total.
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeCloseTo() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const expected_ = arguments[0];
-        if (!expected_.isNumber()) {
-            globalObject.throwInvalidArgumentType("toBeCloseTo", "expected", "number");
-            return .zero;
-        }
-
-        var precision: f64 = 2.0;
-        if (arguments.len > 1) {
-            const precision_ = arguments[1];
-            if (!precision_.isNumber()) {
-                globalObject.throwInvalidArgumentType("toBeCloseTo", "precision", "number");
-                return .zero;
-            }
-
-            precision = precision_.asNumber();
-        }
-
-        const received_: JSC.JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        if (!received_.isNumber()) {
-            globalObject.throwInvalidArgumentType("expect", "received", "number");
-            return .zero;
-        }
-
-        const expected = expected_.asNumber();
-        const received = received_.asNumber();
-
-        // Infinity - Infinity is NaN, so equal infinities are special-cased.
-        // Opposite-signed infinities are NOT close, and `.not` still applies.
-        const same_infinity = std.math.isInf(expected) and expected == received;
-
-        const expected_diff = std.math.pow(f64, 10, -precision) / 2;
-        const actual_diff: f64 = if (same_infinity) 0 else std.math.fabs(received - expected);
-        var pass = same_infinity or actual_diff < expected_diff;
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-        const expected_fmt = expected_.toFmt(globalObject, &formatter);
-        const received_fmt = received_.toFmt(globalObject, &formatter);
-
-        const expected_line = "Expected: <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const expected_precision = "Expected precision: {d}\n";
-        const expected_difference = "Expected difference: \\< <green>{d}<r>\n";
-        const received_difference = "Received difference: <red>{d}<r>\n";
-
-        const suffix_fmt = "\n\n" ++ expected_line ++ received_line ++ "\n" ++ expected_precision ++ expected_difference ++ received_difference;
-
-        if (not) {
-            const fmt = comptime getSignature("toBeCloseTo", "<green>expected<r>, precision", true) ++ suffix_fmt;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeCloseTo", "<green>expected<r>, precision", false) ++ suffix_fmt;
-
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
-            return .zero;
-        }
-
-        globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, received_fmt, precision, expected_diff, actual_diff });
-        return .zero;
-    }
-
-    /// Matcher for `expect(value).toBeOdd()`.
-    ///
-    /// Succeeds when the captured value is an odd BigInt or an odd,
-    /// integer-valued number; every other value counts as "not odd". The `.not`
-    /// flag on `this.op` flips the outcome. On success `thisValue` is returned;
-    /// on failure an error is thrown on `globalObject` and `.zero` is returned.
-    pub fn toBeOdd(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-
-        const received: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        received.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeOdd() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const negated = this.op.contains(.not);
-
-        // Classify the value by representation, most specific check first.
-        const is_odd: bool = odd: {
-            if (received.isBigInt32()) break :odd received.toInt32() & 1 == 1;
-            if (received.isBigInt()) break :odd received.toInt64() & 1 == 1;
-            if (received.isInt32()) break :odd @mod(received.toInt32(), 2) == 1;
-            if (received.isAnyInt()) break :odd @mod(received.toInt64(), 2) == 1;
-            if (received.isNumber()) {
-                const number = JSValue.asNumber(received);
-                // odd only when there is no fractional part and the remainder mod 2 is 1
-                break :odd @mod(number, 1) == 0 and @mod(number, 2) == 1;
-            }
-            break :odd false;
-        };
-
-        // The expectation holds when exactly one of `is_odd` / `negated` is set.
-        if (is_odd != negated) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const received_fmt = received.toFmt(globalObject, &formatter);
-        const received_line = "Received: <red>{any}<r>\n";
-
-        if (negated) {
-            const fmt = comptime getSignature("toBeOdd", "", true) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
-            } else {
-                globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
-            }
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeOdd", "", false) ++ "\n\n" ++ received_line;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{received_fmt});
-        } else {
-            globalObject.throw(Output.prettyFmt(fmt, false), .{received_fmt});
-        }
-        return .zero;
-    }
-
-    /// Matcher for `expect(fn).toThrow(expected?)`.
-    ///
-    /// Calls the captured function (which must be a function) with no arguments.
-    /// If the call yields a promise, the promise is awaited and a rejection is
-    /// treated as a throw. The optional `expected` argument must be a string or
-    /// an object and is checked, in this order, as:
-    ///   - a string: must be a substring of the thrown value's message,
-    ///   - a RegExp: its `test` method is called with the message,
-    ///   - an object with a `message` property: messages are compared with `isSameValue`,
-    ///   - otherwise: treated as a constructor and checked with `isInstanceOf`.
-    /// The `.not` flag on `this.op` inverts each of these checks.
-    ///
-    /// Returns `thisValue` when the expectation holds. Otherwise an error is
-    /// thrown on `globalObject` and `.zero` is returned.
-    pub fn toThrow(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toThrow() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        // `.zero` (empty) means "no expected value was passed".
-        const expected_value: JSValue = if (arguments.len > 0) brk: {
-            const value = arguments[0];
-            if (value.isEmptyOrUndefinedOrNull() or !value.isObject() and !value.isString()) {
-                var fmt = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-                globalObject.throw("Expected value must be string or Error: {any}", .{value.toFmt(globalObject, &fmt)});
-                return .zero;
-            }
-            break :brk value;
-        } else .zero;
-        expected_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (!value.jsType().isFunction()) {
-            globalObject.throw("Expected value must be a function", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-
-        // `null` means the function completed without throwing or rejecting.
-        const result_: ?JSValue = brk: {
-            var vm = globalObject.bunVM();
-            var return_value: JSValue = .zero;
-            var scope = vm.unhandledRejectionScope();
-            var prev_unhandled_pending_rejection_to_capture = vm.unhandled_pending_rejection_to_capture;
-            // Route unhandled rejections raised during the call into `return_value`.
-            vm.unhandled_pending_rejection_to_capture = &return_value;
-            vm.onUnhandledRejection = &VirtualMachine.onQuietUnhandledRejectionHandlerCaptureValue;
-            const return_value_from_function: JSValue = value.call(globalObject, &.{});
-            vm.unhandled_pending_rejection_to_capture = prev_unhandled_pending_rejection_to_capture;
-
-            if (return_value == .zero) {
-                return_value = return_value_from_function;
-            }
-
-            if (return_value.asAnyPromise()) |promise| {
-                globalObject.bunVM().waitForPromise(promise);
-                scope.apply(vm);
-                const promise_result = promise.result(globalObject.vm());
-
-                switch (promise.status(globalObject.vm())) {
-                    .Fulfilled => {
-                        break :brk null;
-                    },
-                    .Rejected => {
-                        // since we know for sure it rejected, we should always return the error
-                        break :brk promise_result.toError() orelse promise_result;
-                    },
-                    .Pending => unreachable,
-                }
-            }
-            scope.apply(vm);
-
-            break :brk return_value.toError();
-        };
-
-        const did_throw = result_ != null;
-
-        if (not) {
-            const signature = comptime getSignature("toThrow", "<green>expected<r>", true);
-
-            if (!did_throw) return thisValue;
-
-            const result: JSValue = result_.?;
-            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-            if (expected_value.isEmpty()) {
-                const signature_no_args = comptime getSignature("toThrow", "", true);
-                if (result.toError()) |err| {
-                    const name = err.get(globalObject, "name") orelse JSValue.undefined;
-                    const message = err.get(globalObject, "message") orelse JSValue.undefined;
-                    const fmt = signature_no_args ++ "\n\nError name: <red>{any}<r>\nError message: <red>{any}<r>\n";
-                    globalObject.throwPretty(fmt, .{
-                        name.toFmt(globalObject, &formatter),
-                        message.toFmt(globalObject, &formatter),
-                    });
-                    return .zero;
-                }
-
-                // non error thrown
-                const fmt = signature_no_args ++ "\n\nThrown value: <red>{any}<r>\n";
-                globalObject.throwPretty(fmt, .{result.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-
-            if (expected_value.isString()) {
-                const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
-
-                // TODO: remove this allocation
-                // partial match
-                {
-                    const expected_slice = expected_value.toSliceOrNull(globalObject) orelse return .zero;
-                    defer expected_slice.deinit();
-                    const received_slice = received_message.toSliceOrNull(globalObject) orelse return .zero;
-                    defer received_slice.deinit();
-                    if (!strings.contains(received_slice.slice(), expected_slice.slice())) return thisValue;
-                }
-
-                const fmt = signature ++ "\n\nExpected substring: not <green>{any}<r>\nReceived message: <red>{any}<r>\n";
-                globalObject.throwPretty(fmt, .{
-                    expected_value.toFmt(globalObject, &formatter),
-                    received_message.toFmt(globalObject, &formatter),
-                });
-                return .zero;
-            }
-
-            if (expected_value.isRegExp()) {
-                const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
-
-                // TODO: REMOVE THIS GETTER! Expose a binding to call .test on the RegExp object directly.
-                if (expected_value.get(globalObject, "test")) |test_fn| {
-                    const matches = test_fn.callWithThis(globalObject, expected_value, &.{received_message});
-                    if (!matches.toBooleanSlow(globalObject)) return thisValue;
-                }
-
-                const fmt = signature ++ "\n\nExpected pattern: not <green>{any}<r>\nReceived message: <red>{any}<r>\n";
-                globalObject.throwPretty(fmt, .{
-                    expected_value.toFmt(globalObject, &formatter),
-                    received_message.toFmt(globalObject, &formatter),
-                });
-                return .zero;
-            }
-
-            if (expected_value.get(globalObject, "message")) |expected_message| {
-                const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
-                // no partial match for this case
-                if (!expected_message.isSameValue(received_message, globalObject)) return thisValue;
-
-                const fmt = signature ++ "\n\nExpected message: not <green>{any}<r>\n";
-                globalObject.throwPretty(fmt, .{expected_message.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-
-            if (!result.isInstanceOf(globalObject, expected_value)) return thisValue;
-
-            var expected_class = ZigString.Empty;
-            expected_value.getClassName(globalObject, &expected_class);
-            const received_message = result.getIfPropertyExistsImpl(globalObject, "message", 7);
-            const fmt = signature ++ "\n\nExpected constructor: not <green>{s}<r>\n\nReceived message: <red>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_class, received_message.toFmt(globalObject, &formatter) });
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_class, received_message.toFmt(globalObject, &formatter) });
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toThrow", "<green>expected<r>", false);
-        if (did_throw) {
-            if (expected_value.isEmpty()) return thisValue;
-
-            const result: JSValue = if (result_.?.toError()) |r|
-                r
-            else
-                result_.?;
-
-            // For objects use the `message` property; otherwise fall back to the
-            // string conversion of the thrown value.
-            const _received_message: ?JSValue = if (result.isObject())
-                result.get(globalObject, "message")
-            else if (result.toStringOrNull(globalObject)) |js_str|
-                JSC.JSValue.fromCell(js_str)
-            else
-                null;
-
-            if (expected_value.isString()) {
-                if (_received_message) |received_message| {
-                    // TODO: remove this allocation
-                    // partial match
-                    const expected_slice = expected_value.toSliceOrNull(globalObject) orelse return .zero;
-                    defer expected_slice.deinit();
-                    const received_slice = received_message.toSlice(globalObject, globalObject.allocator());
-                    defer received_slice.deinit();
-                    if (strings.contains(received_slice.slice(), expected_slice.slice())) return thisValue;
-                }
-
-                // error: message from received error does not match expected string
-                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-                if (_received_message) |received_message| {
-                    const expected_value_fmt = expected_value.toFmt(globalObject, &formatter);
-                    const received_message_fmt = received_message.toFmt(globalObject, &formatter);
-                    const fmt = signature ++ "\n\n" ++ "Expected substring: <green>{any}<r>\nReceived message: <red>{any}<r>\n";
-                    globalObject.throwPretty(fmt, .{ expected_value_fmt, received_message_fmt });
-                    return .zero;
-                }
-
-                const expected_fmt = expected_value.toFmt(globalObject, &formatter);
-                const received_fmt = result.toFmt(globalObject, &formatter);
-                const fmt = signature ++ "\n\n" ++ "Expected substring: <green>{any}<r>\nReceived value: <red>{any}<r>";
-                globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
-
-                return .zero;
-            }
-
-            if (expected_value.isRegExp()) {
-                if (_received_message) |received_message| {
-                    // TODO: REMOVE THIS GETTER! Expose a binding to call .test on the RegExp object directly.
-                    if (expected_value.get(globalObject, "test")) |test_fn| {
-                        const matches = test_fn.callWithThis(globalObject, expected_value, &.{received_message});
-                        if (matches.toBooleanSlow(globalObject)) return thisValue;
-                    }
-                }
-
-                // error: message from received error does not match expected pattern
-                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-                if (_received_message) |received_message| {
-                    const expected_value_fmt = expected_value.toFmt(globalObject, &formatter);
-                    const received_message_fmt = received_message.toFmt(globalObject, &formatter);
-                    const fmt = signature ++ "\n\n" ++ "Expected pattern: <green>{any}<r>\nReceived message: <red>{any}<r>\n";
-                    globalObject.throwPretty(fmt, .{ expected_value_fmt, received_message_fmt });
-
-                    return .zero;
-                }
-
-                const expected_fmt = expected_value.toFmt(globalObject, &formatter);
-                const received_fmt = result.toFmt(globalObject, &formatter);
-                const fmt = signature ++ "\n\n" ++ "Expected pattern: <green>{any}<r>\nReceived value: <red>{any}<r>";
-                globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
-                return .zero;
-            }
-
-            // If it's not an object, we are going to crash here.
-            std.debug.assert(expected_value.isObject());
-
-            if (expected_value.get(globalObject, "message")) |expected_message| {
-                if (_received_message) |received_message| {
-                    if (received_message.isSameValue(expected_message, globalObject)) return thisValue;
-                }
-
-                // error: message from received error does not match expected error message.
-                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-                if (_received_message) |received_message| {
-                    const expected_fmt = expected_message.toFmt(globalObject, &formatter);
-                    const received_fmt = received_message.toFmt(globalObject, &formatter);
-                    const fmt = signature ++ "\n\nExpected message: <green>{any}<r>\nReceived message: <red>{any}<r>\n";
-                    globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
-                    return .zero;
-                }
-
-                const expected_fmt = expected_message.toFmt(globalObject, &formatter);
-                const received_fmt = result.toFmt(globalObject, &formatter);
-                const fmt = signature ++ "\n\nExpected message: <green>{any}<r>\nReceived value: <red>{any}<r>\n";
-                globalObject.throwPretty(fmt, .{ expected_fmt, received_fmt });
-                return .zero;
-            }
-
-            if (result.isInstanceOf(globalObject, expected_value)) return thisValue;
-
-            // error: received error not instance of received error constructor
-            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-            var expected_class = ZigString.Empty;
-            var received_class = ZigString.Empty;
-            expected_value.getClassName(globalObject, &expected_class);
-            result.getClassName(globalObject, &received_class);
-            const fmt = signature ++ "\n\nExpected constructor: <green>{s}<r>\nReceived constructor: <red>{s}<r>\n\n";
-
-            if (_received_message) |received_message| {
-                const message_fmt = fmt ++ "Received message: <red>{any}<r>\n";
-                const received_message_fmt = received_message.toFmt(globalObject, &formatter);
-
-                globalObject.throwPretty(message_fmt, .{
-                    expected_class,
-                    received_class,
-                    received_message_fmt,
-                });
-                return .zero;
-            }
-
-            const received_fmt = result.toFmt(globalObject, &formatter);
-            const value_fmt = fmt ++ "Received value: <red>{any}<r>\n";
-
-            globalObject.throwPretty(value_fmt, .{
-                expected_class,
-                received_class,
-                received_fmt,
-            });
-            return .zero;
-        }
-
-        // did not throw
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        const received_line = "Received function did not throw\n";
-
-        if (expected_value.isEmpty()) {
-            const fmt = comptime getSignature("toThrow", "", false) ++ "\n\n" ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{});
-            return .zero;
-        }
-
-        if (expected_value.isString()) {
-            const expected_fmt = "\n\nExpected substring: <green>{any}<r>\n\n" ++ received_line;
-            const fmt = signature ++ expected_fmt;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_value.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        if (expected_value.isRegExp()) {
-            const expected_fmt = "\n\nExpected pattern: <green>{any}<r>\n\n" ++ received_line;
-            const fmt = signature ++ expected_fmt;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_value.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        if (expected_value.get(globalObject, "message")) |expected_message| {
-            const expected_fmt = "\n\nExpected message: <green>{any}<r>\n\n" ++ received_line;
-            const fmt = signature ++ expected_fmt;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{expected_message.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{expected_message.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        const expected_fmt = "\n\nExpected constructor: <green>{s}<r>\n\n" ++ received_line;
-        var expected_class = ZigString.Empty;
-        expected_value.getClassName(globalObject, &expected_class);
-        const fmt = signature ++ expected_fmt;
-        if (Output.enable_ansi_colors) {
-            globalObject.throw(Output.prettyFmt(fmt, true), .{expected_class});
-            return .zero;
-        }
-        // Colors are disabled here, so the message must be rendered without ANSI escapes.
-        globalObject.throw(Output.prettyFmt(fmt, false), .{expected_class});
-        return .zero;
-    }
-
-    /// Matcher for `expect(value).toMatchSnapshot(propertyMatchers?, hint?)`.
-    ///
-    /// Accepted argument shapes:
-    ///   - no arguments,
-    ///   - one argument: a string is used as the hint, an object as the property matchers,
-    ///   - two arguments: the first must be an object (property matchers); the
-    ///     second is used as the hint when it is a string.
-    /// Snapshot matchers cannot be combined with `.not`. When property matchers
-    /// are given, the received value must be an object that deep-matches them.
-    ///
-    /// The value is looked up (or stored) through `Jest.runner.?.snapshots.getOrPut`.
-    /// If a saved snapshot is returned, it is compared with the pretty-formatted
-    /// value and the runner's `passed` / `failed` counters are updated.
-    ///
-    /// Returns `thisValue` on success. Otherwise an error is thrown on
-    /// `globalObject` and `.zero` is returned.
-    pub fn toMatchSnapshot(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(2);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toMatchSnapshot() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        if (not) {
-            const signature = comptime getSignature("toMatchSnapshot", "", true);
-            const fmt = signature ++ "\n\n<b>Matcher error<r>: Snapshot matchers cannot be used with <b>not<r>\n";
-            globalObject.throwPretty(fmt, .{});
-            // Stop here: continuing with an exception pending could still
-            // record a snapshot or throw a second time.
-            return .zero;
-        }
-
-        var hint_string: ZigString = ZigString.Empty;
-        var property_matchers: ?JSValue = null;
-        switch (arguments.len) {
-            0 => {},
-            1 => {
-                if (arguments[0].isString()) {
-                    arguments[0].toZigString(&hint_string, globalObject);
-                } else if (arguments[0].isObject()) {
-                    property_matchers = arguments[0];
-                }
-            },
-            else => {
-                if (!arguments[0].isObject()) {
-                    const signature = comptime getSignature("toMatchSnapshot", "<green>properties<r><d>, <r>hint", false);
-                    const fmt = signature ++ "\n\nMatcher error: Expected <green>properties<r> must be an object\n";
-                    globalObject.throwPretty(fmt, .{});
-                    return .zero;
-                }
-
-                property_matchers = arguments[0];
-
-                if (arguments[1].isString()) {
-                    arguments[1].toZigString(&hint_string, globalObject);
-                }
-            },
-        }
-
-        var hint = hint_string.toSlice(default_allocator);
-        defer hint.deinit();
-
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        if (!value.isObject() and property_matchers != null) {
-            const signature = comptime getSignature("toMatchSnapshot", "<green>properties<r><d>, <r>hint", false);
-            const fmt = signature ++ "\n\n<b>Matcher error: <red>received<r> values must be an object when the matcher has <green>properties<r>\n";
-            globalObject.throwPretty(fmt, .{});
-            return .zero;
-        }
-
-        if (property_matchers) |prop_matchers| {
-            if (!value.jestDeepMatch(prop_matchers, globalObject, true)) {
-                // TODO: print diff with properties from propertyMatchers
-                const signature = comptime getSignature("toMatchSnapshot", "<green>propertyMatchers<r>", false);
-                const fmt = signature ++ "\n\nExpected <green>propertyMatchers<r> to match properties from received object" ++
-                    "\n\nReceived: {any}\n";
-
-                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject };
-                globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-        }
-
-        const result = Jest.runner.?.snapshots.getOrPut(this, value, hint.slice(), globalObject) catch |err| {
-            var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject };
-            const test_file_path = Jest.runner.?.files.get(this.scope.file_id).source.path.text;
-            switch (err) {
-                error.FailedToOpenSnapshotFile => globalObject.throw("Failed to open snapshot file for test file: {s}", .{test_file_path}),
-                error.FailedToMakeSnapshotDirectory => globalObject.throw("Failed to make snapshot directory for test file: {s}", .{test_file_path}),
-                error.FailedToWriteSnapshotFile => globalObject.throw("Failed to write to snapshot file: {s}", .{test_file_path}),
-                error.ParseError => globalObject.throw("Failed to parse snapshot file for: {s}", .{test_file_path}),
-                else => globalObject.throw("Failed to snapshot value: {any}", .{value.toFmt(globalObject, &formatter)}),
-            }
-            return .zero;
-        };
-
-        if (result) |saved_value| {
-            var pretty_value: MutableString = MutableString.init(default_allocator, 0) catch unreachable;
-            // Registered before the fallible format call so the buffer is also
-            // released when formatting fails.
-            defer pretty_value.deinit();
-            value.jestSnapshotPrettyFormat(&pretty_value, globalObject) catch {
-                var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject };
-                globalObject.throw("Failed to pretty format value: {s}", .{value.toFmt(globalObject, &formatter)});
-                return .zero;
-            };
-
-            if (strings.eqlLong(pretty_value.toOwnedSliceLeaky(), saved_value, true)) {
-                Jest.runner.?.snapshots.passed += 1;
-                return thisValue;
-            }
-
-            Jest.runner.?.snapshots.failed += 1;
-            const signature = comptime getSignature("toMatchSnapshot", "<green>expected<r>", false);
-            const fmt = signature ++ "\n\n{any}\n";
-            const diff_format = DiffFormatter{
-                .received_string = pretty_value.toOwnedSliceLeaky(),
-                .expected_string = saved_value,
-                .globalObject = globalObject,
-            };
-
-            globalObject.throwPretty(fmt, .{diff_format});
-            return .zero;
-        }
-
-        return thisValue;
-    }
-
-    pub fn toBeEmpty(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const value: JSValue = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeEmpty() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = false;
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-        const actual_length = value.getLengthIfPropertyExistsInternal(globalObject);
-
-        if (actual_length == std.math.inf(f64)) {
-            if (value.jsTypeLoose().isObject()) {
-                if (value.isIterable(globalObject)) {
-                    var any_properties_in_iterator = false;
-                    value.forEach(globalObject, &any_properties_in_iterator, struct {
-                        pub fn anythingInIterator(
-                            _: *JSC.VM,
-                            _: *JSGlobalObject,
-                            any_: ?*anyopaque,
-                            _: JSValue,
-                        ) callconv(.C) void {
-                            bun.cast(*bool, any_.?).* = true;
-                        }
-                    }.anythingInIterator);
-                    pass = !any_properties_in_iterator;
-                } else {
-                    var props_iter = JSC.JSPropertyIterator(.{
-                        .skip_empty_name = false,
-
-                        .include_value = true,
-                    }).init(globalObject, value.asObjectRef());
-                    defer props_iter.deinit();
-                    pass = props_iter.len == 0;
-                }
-            } else {
-                const signature = comptime getSignature("toBeEmpty", "", false);
-                const fmt = signature ++ "\n\nExpected value to be a string, object, or iterable" ++
-                    "\n\nReceived: <red>{any}<r>\n";
-                globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-        } else if (std.math.isNan(actual_length)) {
-            globalObject.throw("Received value has non-number length property: {}", .{actual_length});
-            return .zero;
-        } else {
-            pass = actual_length == 0;
-        }
-
-        if (not and pass) {
-            const signature = comptime getSignature("toBeEmpty", "", true);
-            const fmt = signature ++ "\n\nExpected value <b>not<r> to be a string, object, or iterable" ++
-                "\n\nReceived: <red>{any}<r>\n";
-            globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        if (not) {
-            const signature = comptime getSignature("toBeEmpty", "", true);
-            const fmt = signature ++ "\n\nExpected value <b>not<r> to be empty" ++
-                "\n\nReceived: <red>{any}<r>\n";
-            globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toBeEmpty", "", false);
-        const fmt = signature ++ "\n\nExpected value to be empty" ++
-            "\n\nReceived: <red>{any}<r>\n";
-        globalObject.throwPretty(fmt, .{value.toFmt(globalObject, &formatter)});
-        return .zero;
-    }
-
-    pub fn toBeNil(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeNil() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isUndefinedOrNull() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeNil", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeNil", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeArray(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeArray() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.jsType().isArray() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeArray", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeArray", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeArrayOfSize(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalThis.throwInvalidArguments("toBeArrayOfSize() requires 1 argument", .{});
-            return .zero;
-        }
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeArrayOfSize() must be called in a test", .{});
-            return .zero;
-        }
-
-        const size = arguments[0];
-        size.ensureStillAlive();
-
-        if (!size.isAnyInt()) {
-            globalThis.throw("toBeArrayOfSize() requires the first argument to be a number", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        var pass = value.jsType().isArray() and @intCast(i32, value.getLength(globalThis)) == size.toInt32();
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeArrayOfSize", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeArrayOfSize", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeBoolean(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeBoolean() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isBoolean() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeBoolean", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeBoolean", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeTrue(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeTrue() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = (value.isBoolean() and value.toBoolean()) != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeTrue", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeTrue", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeFalse(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeFalse() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = (value.isBoolean() and !value.toBoolean()) != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeFalse", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeFalse", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeNumber(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeNumber() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isNumber() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeNumber", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeNumber", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeInteger(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeInteger() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isAnyInt() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeInteger", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeInteger", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeFinite(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeFinite() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isNumber();
-        if (pass) {
-            const num: f64 = value.asNumber();
-            pass = std.math.isFinite(num) and !std.math.isNan(num);
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeFinite", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeFinite", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBePositive(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBePositive() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isNumber();
-        if (pass) {
-            const num: f64 = value.asNumber();
-            pass = @round(num) > 0 and !std.math.isInf(num) and !std.math.isNan(num);
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBePositive", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBePositive", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeNegative(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeNegative() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isNumber();
-        if (pass) {
-            const num: f64 = value.asNumber();
-            pass = @round(num) < 0 and !std.math.isInf(num) and !std.math.isNan(num);
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeNegative", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeNegative", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeTypeOf(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalThis.throwInvalidArguments("toBeTypeOf() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeTypeOf() must be called in a test", .{});
-            return .zero;
-        }
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        const expected = arguments[0];
-        expected.ensureStillAlive();
-
-        const expectedAsStr = expected.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-        active_test_expectation_counter.actual += 1;
-
-        if (!expected.isString()) {
-            globalThis.throwInvalidArguments("toBeTypeOf() requires a string argument", .{});
-            return .zero;
-        }
-
-        if (!std.mem.eql(u8, expectedAsStr, "function") and
-            !std.mem.eql(u8, expectedAsStr, "object") and
-            !std.mem.eql(u8, expectedAsStr, "bigint") and
-            !std.mem.eql(u8, expectedAsStr, "boolean") and
-            !std.mem.eql(u8, expectedAsStr, "number") and
-            !std.mem.eql(u8, expectedAsStr, "string") and
-            !std.mem.eql(u8, expectedAsStr, "symbol") and
-            !std.mem.eql(u8, expectedAsStr, "undefined"))
-        {
-            globalThis.throwInvalidArguments("toBeTypeOf() requires a valid type string argument ('function', 'object', 'bigint', 'boolean', 'number', 'string', 'symbol', 'undefined')", .{});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass = false;
-        var whatIsTheType: []const u8 = "";
-
-        // Checking for function/class should be done before everything else, or it will fail.
-        if (value.isCallable(globalThis.vm())) {
-            whatIsTheType = "function";
-        } else if (value.isObject() or value.jsType().isArray() or value.isNull()) {
-            whatIsTheType = "object";
-        } else if (value.isBigInt()) {
-            whatIsTheType = "bigint";
-        } else if (value.isBoolean()) {
-            whatIsTheType = "boolean";
-        } else if (value.isNumber()) {
-            whatIsTheType = "number";
-        } else if (value.jsType().isString()) {
-            whatIsTheType = "string";
-        } else if (value.isSymbol()) {
-            whatIsTheType = "symbol";
-        } else if (value.isUndefined()) {
-            whatIsTheType = "undefined";
-        } else {
-            globalThis.throw("Internal consistency error: unknown JSValue type", .{});
-            return .zero;
-        }
-
-        pass = std.mem.eql(u8, expectedAsStr, whatIsTheType);
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-        const expected_str = expected.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeTypeOf", "", true) ++ "\n\n" ++ "Expected type: not <green>{any}<r>\n" ++ "Received type: <red>\"{s}\"<r>\nReceived value: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{ expected_str, whatIsTheType, received });
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeTypeOf", "", false) ++ "\n\n" ++ "Expected type: <green>{any}<r>\n" ++ "Received type: <red>\"{s}\"<r>\nReceived value: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{ expected_str, whatIsTheType, received });
-        return .zero;
-    }
-
-    pub fn toBeWithin(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(2);
-        const arguments = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalThis.throwInvalidArguments("toBeWithin() requires 2 arguments", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeWithin() must be called in a test", .{});
-            return .zero;
-        }
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        const startValue = arguments[0];
-        startValue.ensureStillAlive();
-
-        if (!startValue.isNumber()) {
-            globalThis.throw("toBeWithin() requires the first argument to be a number", .{});
-            return .zero;
-        }
-
-        const endValue = arguments[1];
-        endValue.ensureStillAlive();
-
-        if (!endValue.isNumber()) {
-            globalThis.throw("toBeWithin() requires the second argument to be a number", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isNumber();
-        if (pass) {
-            const num = value.asNumber();
-            pass = num >= startValue.asNumber() and num < endValue.asNumber();
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const start_fmt = startValue.toFmt(globalThis, &formatter);
-        const end_fmt = endValue.toFmt(globalThis, &formatter);
-        const received_fmt = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const expected_line = "Expected: not between <green>{any}<r> <d>(inclusive)<r> and <green>{any}<r> <d>(exclusive)<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeWithin", "<green>start<r><d>, <r><green>end<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            globalThis.throwPretty(fmt, .{ start_fmt, end_fmt, received_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected: between <green>{any}<r> <d>(inclusive)<r> and <green>{any}<r> <d>(exclusive)<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeWithin", "<green>start<r><d>, <r><green>end<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        globalThis.throwPretty(fmt, .{ start_fmt, end_fmt, received_fmt });
-        return .zero;
-    }
-
-    pub fn toBeSymbol(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeSymbol() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isSymbol() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeSymbol", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeSymbol", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeFunction(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeFunction() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isCallable(globalThis.vm()) != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeFunction", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeFunction", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeDate(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeDate() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isDate() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeDate", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeDate", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toBeString(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toBeString() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-        const pass = value.isString() != not;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const received = value.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const fmt = comptime getSignature("toBeString", "", true) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-            globalThis.throwPretty(fmt, .{received});
-            return .zero;
-        }
-
-        const fmt = comptime getSignature("toBeString", "", false) ++ "\n\n" ++ "Received: <red>{any}<r>\n";
-        globalThis.throwPretty(fmt, .{received});
-        return .zero;
-    }
-
-    pub fn toInclude(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const arguments_ = callFrame.arguments(1);
-        const arguments = arguments_.ptr[0..arguments_.len];
-
-        if (arguments.len < 1) {
-            globalThis.throwInvalidArguments("toInclude() requires 1 argument", .{});
-            return .zero;
-        }
-
-        const expected = arguments[0];
-        expected.ensureStillAlive();
-
-        if (!expected.isString()) {
-            globalThis.throw("toInclude() requires the first argument to be a string", .{});
-            return .zero;
-        }
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toInclude() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isString();
-        if (pass) {
-            const value_string = value.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-            const expected_string = expected.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-            pass = strings.contains(value_string, expected_string) or expected_string.len == 0;
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const value_fmt = value.toFmt(globalThis, &formatter);
-        const expected_fmt = expected.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const expected_line = "Expected to not include: <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toInclude", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected to include: <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toInclude", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toStartWith(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const arguments_ = callFrame.arguments(1);
-        const arguments = arguments_.ptr[0..arguments_.len];
-
-        if (arguments.len < 1) {
-            globalThis.throwInvalidArguments("toStartWith() requires 1 argument", .{});
-            return .zero;
-        }
-
-        const expected = arguments[0];
-        expected.ensureStillAlive();
-
-        if (!expected.isString()) {
-            globalThis.throw("toStartWith() requires the first argument to be a string", .{});
-            return .zero;
-        }
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toStartWith() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isString();
-        if (pass) {
-            const value_string = value.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-            const expected_string = expected.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-            pass = strings.startsWith(value_string, expected_string) or expected_string.len == 0;
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const value_fmt = value.toFmt(globalThis, &formatter);
-        const expected_fmt = expected.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const expected_line = "Expected to not start with: <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toStartWith", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected to start with: <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toStartWith", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toEndWith(this: *Expect, globalThis: *JSGlobalObject, callFrame: *CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalThis);
-
-        const thisValue = callFrame.this();
-        const arguments_ = callFrame.arguments(1);
-        const arguments = arguments_.ptr[0..arguments_.len];
-
-        if (arguments.len < 1) {
-            globalThis.throwInvalidArguments("toEndWith() requires 1 argument", .{});
-            return .zero;
-        }
-
-        const expected = arguments[0];
-        expected.ensureStillAlive();
-
-        if (!expected.isString()) {
-            globalThis.throw("toEndWith() requires the first argument to be a string", .{});
-            return .zero;
-        }
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalThis.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalThis.throw("toEndWith() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var pass = value.isString();
-        if (pass) {
-            const value_string = value.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-            const expected_string = expected.toString(globalThis).toSlice(globalThis, default_allocator).slice();
-            pass = strings.endsWith(value_string, expected_string) or expected_string.len == 0;
-        }
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-
-        if (pass) return thisValue;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalThis, .quote_strings = true };
-        const value_fmt = value.toFmt(globalThis, &formatter);
-        const expected_fmt = expected.toFmt(globalThis, &formatter);
-
-        if (not) {
-            const expected_line = "Expected to not end with: <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toEndWith", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected to end with: <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toEndWith", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        globalThis.throwPretty(fmt, .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toBeInstanceOf(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toBeInstanceOf() requires 1 argument", .{});
-            return .zero;
-        }
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toBeInstanceOf() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-        const expected_value = arguments[0];
-        if (!expected_value.isConstructor()) {
-            globalObject.throw("Expected value must be a function: {any}", .{expected_value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-        expected_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        const not = this.op.contains(.not);
-        var pass = value.isInstanceOf(globalObject, expected_value);
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        const expected_fmt = expected_value.toFmt(globalObject, &formatter);
-        const value_fmt = value.toFmt(globalObject, &formatter);
-        if (not) {
-            const expected_line = "Expected constructor: not <green>{any}<r>\n";
-            const received_line = "Received value: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toBeInstanceOf", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{ expected_fmt, value_fmt });
-                return .zero;
-            }
-
-            globalObject.throw(Output.prettyFmt(fmt, false), .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected constructor: <green>{any}<r>\n";
-        const received_line = "Received value: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toBeInstanceOf", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        globalObject.throwPretty(fmt, .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toMatch(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        JSC.markBinding(@src());
-
-        defer this.postMatch(globalObject);
-
-        const thisValue = callFrame.this();
-        const _arguments = callFrame.arguments(1);
-        const arguments: []const JSValue = _arguments.ptr[0.._arguments.len];
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toMatch() must be called in a test", .{});
-            return .zero;
-        }
-
-        if (arguments.len < 1) {
-            globalObject.throwInvalidArguments("toMatch() requires 1 argument", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-
-        const expected_value = arguments[0];
-        if (!expected_value.isString() and !expected_value.isRegExp()) {
-            globalObject.throw("Expected value must be a string or regular expression: {any}", .{expected_value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-        expected_value.ensureStillAlive();
-
-        const value = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-        value.ensureStillAlive();
-
-        if (!value.isString()) {
-            globalObject.throw("Received value must be a string: {any}", .{value.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        const not = this.op.contains(.not);
-        var pass: bool = brk: {
-            if (expected_value.isString()) {
-                break :brk value.stringIncludes(globalObject, expected_value);
-            } else if (expected_value.isRegExp()) {
-                break :brk expected_value.toMatch(globalObject, value);
-            }
-            unreachable;
-        };
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        const expected_fmt = expected_value.toFmt(globalObject, &formatter);
-        const value_fmt = value.toFmt(globalObject, &formatter);
-
-        if (not) {
-            const expected_line = "Expected substring or pattern: not <green>{any}<r>\n";
-            const received_line = "Received: <red>{any}<r>\n";
-            const fmt = comptime getSignature("toMatch", "<green>expected<r>", true) ++ "\n\n" ++ expected_line ++ received_line;
-            globalObject.throwPretty(fmt, .{ expected_fmt, value_fmt });
-            return .zero;
-        }
-
-        const expected_line = "Expected substring or pattern: <green>{any}<r>\n";
-        const received_line = "Received: <red>{any}<r>\n";
-        const fmt = comptime getSignature("toMatch", "<green>expected<r>", false) ++ "\n\n" ++ expected_line ++ received_line;
-        globalObject.throwPretty(fmt, .{ expected_fmt, value_fmt });
-        return .zero;
-    }
-
-    pub fn toHaveBeenCalled(this: *Expect, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        JSC.markBinding(@src());
-        const thisValue = callframe.this();
-        defer this.postMatch(globalObject);
-
-        const value: JSValue = JSC.Jest.Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        const calls = JSMockFunction__getCalls(value);
-        active_test_expectation_counter.actual += 1;
-
-        if (calls == .zero or !calls.jsType().isArray()) {
-            globalObject.throw("Expected value must be a mock function: {}", .{value});
-            return .zero;
-        }
-
-        var pass = calls.getLength(globalObject) > 0;
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        if (not) {
-            const signature = comptime getSignature("toHaveBeenCalled", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\nExpected: not <green>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{calls.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{calls.toFmt(globalObject, &formatter)});
-            return .zero;
-        } else {
-            const signature = comptime getSignature("toHaveBeenCalled", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\nExpected <green>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{calls.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{calls.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        unreachable;
-    }
-    pub fn toHaveBeenCalledTimes(this: *Expect, globalObject: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        JSC.markBinding(@src());
-
-        const thisValue = callframe.this();
-        const arguments_ = callframe.arguments(1);
-        const arguments: []const JSValue = arguments_.ptr[0..arguments_.len];
-        defer this.postMatch(globalObject);
-        const value: JSValue = JSC.Jest.Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        active_test_expectation_counter.actual += 1;
-
-        const calls = JSMockFunction__getCalls(value);
-
-        if (calls == .zero or !calls.jsType().isArray()) {
-            globalObject.throw("Expected value must be a mock function: {}", .{value});
-            return .zero;
-        }
-
-        if (arguments.len < 1 or !arguments[0].isAnyInt()) {
-            globalObject.throwInvalidArguments("toHaveBeenCalledTimes() requires 1 integer argument", .{});
-            return .zero;
-        }
-
-        const times = arguments[0].coerce(i32, globalObject);
-
-        var pass = @intCast(i32, calls.getLength(globalObject)) == times;
-
-        const not = this.op.contains(.not);
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        var formatter = JSC.ZigConsoleClient.Formatter{ .globalThis = globalObject, .quote_strings = true };
-        if (not) {
-            const signature = comptime getSignature("toHaveBeenCalled", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\nExpected: not <green>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{calls.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{calls.toFmt(globalObject, &formatter)});
-            return .zero;
-        } else {
-            const signature = comptime getSignature("toHaveBeenCalled", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\nExpected <green>{any}<r>\n";
-            if (Output.enable_ansi_colors) {
-                globalObject.throw(Output.prettyFmt(fmt, true), .{calls.toFmt(globalObject, &formatter)});
-                return .zero;
-            }
-            globalObject.throw(Output.prettyFmt(fmt, false), .{calls.toFmt(globalObject, &formatter)});
-            return .zero;
-        }
-
-        unreachable;
-    }
-
-    pub fn toMatchObject(this: *Expect, globalObject: *JSC.JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        JSC.markBinding(@src());
-
-        defer this.postMatch(globalObject);
-        const thisValue = callFrame.this();
-        const args = callFrame.arguments(1).slice();
-
-        if (this.scope.tests.items.len <= this.test_id) {
-            globalObject.throw("toMatchObject() must be called in a test", .{});
-            return .zero;
-        }
-
-        active_test_expectation_counter.actual += 1;
-
-        const not = this.op.contains(.not);
-
-        const received_object = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        if (!received_object.isObject()) {
-            const matcher_error = "\n\n<b>Matcher error<r>: <red>received<r> value must be a non-null object\n";
-            if (not) {
-                const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", true) ++ matcher_error;
-                globalObject.throwPretty(fmt, .{});
-                return .zero;
-            }
-
-            const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", false) ++ matcher_error;
-            globalObject.throwPretty(fmt, .{});
-            return .zero;
-        }
-
-        if (args.len < 1 or !args[0].isObject()) {
-            const matcher_error = "\n\n<b>Matcher error<r>: <green>expected<r> value must be a non-null object\n";
-            if (not) {
-                const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", true) ++ matcher_error;
-                globalObject.throwPretty(fmt, .{});
-                return .zero;
-            }
-            const fmt = comptime getSignature("toMatchObject", "<green>expected<r>", false) ++ matcher_error;
-            globalObject.throwPretty(fmt, .{});
-            return .zero;
-        }
-
-        const property_matchers = args[0];
-
-        var pass = received_object.jestDeepMatch(property_matchers, globalObject, true);
-
-        if (not) pass = !pass;
-        if (pass) return thisValue;
-
-        // handle failure
-        const diff_formatter = DiffFormatter{
-            .received = received_object,
-            .expected = property_matchers,
-            .globalObject = globalObject,
-            .not = not,
-        };
-
-        if (not) {
-            const signature = comptime getSignature("toMatchObject", "<green>expected<r>", true);
-            const fmt = signature ++ "\n\n{any}\n";
-            globalObject.throwPretty(fmt, .{diff_formatter});
-            return .zero;
-        }
-
-        const signature = comptime getSignature("toMatchObject", "<green>expected<r>", false);
-        const fmt = signature ++ "\n\n{any}\n";
-        globalObject.throwPretty(fmt, .{diff_formatter});
-        return .zero;
-    }
-
-    pub const toHaveBeenCalledWith = notImplementedJSCFn;
-    pub const toHaveBeenLastCalledWith = notImplementedJSCFn;
-    pub const toHaveBeenNthCalledWith = notImplementedJSCFn;
-    pub const toHaveReturnedTimes = notImplementedJSCFn;
-    pub const toHaveReturnedWith = notImplementedJSCFn;
-    pub const toHaveLastReturnedWith = notImplementedJSCFn;
-    pub const toHaveNthReturnedWith = notImplementedJSCFn;
-    pub const toContainEqual = notImplementedJSCFn;
-    pub const toMatchInlineSnapshot = notImplementedJSCFn;
-    pub const toThrowErrorMatchingSnapshot = notImplementedJSCFn;
-    pub const toThrowErrorMatchingInlineSnapshot = notImplementedJSCFn;
-
-    pub const getStaticNot = notImplementedStaticProp;
-    pub const getStaticResolves = notImplementedStaticProp;
-    pub const getStaticRejects = notImplementedStaticProp;
-
-    pub fn getNot(this: *Expect, thisValue: JSValue, globalObject: *JSGlobalObject) callconv(.C) JSValue {
-        _ = Expect.capturedValueGetCached(thisValue) orelse {
-            globalObject.throw("Internal consistency error: the expect(value) was garbage collected but it should not have been!", .{});
-            return .zero;
-        };
-
-        this.op.toggle(.not);
-
-        return thisValue;
-    }
-
-    pub const getResolves = notImplementedJSCProp;
-    pub const getRejects = notImplementedJSCProp;
-
-    pub fn any(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        return ExpectAny.call(globalObject, callFrame);
-    }
-
-    pub fn anything(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        return ExpectAnything.call(globalObject, callFrame);
-    }
-
-    pub fn stringContaining(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        return ExpectStringContaining.call(globalObject, callFrame);
-    }
-
-    pub fn stringMatching(globalObject: *JSGlobalObject, callFrame: *JSC.CallFrame) callconv(.C) JSValue {
-        return ExpectStringMatching.call(globalObject, callFrame);
-    }
-
-    pub const extend = notImplementedStaticFn;
-    pub const arrayContaining = notImplementedStaticFn;
-    pub const assertions = notImplementedStaticFn;
-    pub const hasAssertions = notImplementedStaticFn;
-    pub const objectContaining = notImplementedStaticFn;
-    pub const addSnapshotSerializer = notImplementedStaticFn;
-
-    pub fn notImplementedJSCFn(_: *Expect, globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        globalObject.throw("Not implemented", .{});
-        return .zero;
-    }
-
-    pub fn notImplementedStaticFn(globalObject: *JSC.JSGlobalObject, _: *JSC.CallFrame) callconv(.C) JSC.JSValue {
-        globalObject.throw("Not implemented", .{});
-        return .zero;
-    }
-
-    pub fn notImplementedJSCProp(_: *Expect, _: JSC.JSValue, globalObject: *JSC.JSGlobalObject) callconv(.C) JSC.JSValue {
-        globalObject.throw("Not implemented", .{});
-        return .zero;
-    }
-
-    pub fn notImplementedStaticProp(globalObject: *JSC.JSGlobalObject, _: JSC.JSValue, _: JSC.JSValue) callconv(.C) JSC.JSValue {
-        globalObject.throw("Not implemented", .{});
-        return .zero;
-    }
-
-    pub fn postMatch(_: *Expect, globalObject: *JSC.JSGlobalObject) void {
-        var vm = globalObject.bunVM();
-        vm.autoGarbageCollect();
-    }
-};
-
 pub const TestScope = struct {
     label: string = "",
     parent: *DescribeScope,
@@ -4338,7 +584,7 @@ pub const TestScope = struct {
         const err = arguments.ptr[0];
         globalThis.bunVM().runErrorHandler(err, null);
         var task: *TestRunnerTask = arguments.ptr[1].asPromisePtr(TestRunnerTask);
-        task.handleResult(.{ .fail = active_test_expectation_counter.actual }, .promise);
+        task.handleResult(.{ .fail = expect.active_test_expectation_counter.actual }, .promise);
         globalThis.bunVM().autoGarbageCollect();
         return JSValue.jsUndefined();
     }
@@ -4346,7 +592,7 @@ pub const TestScope = struct {
     pub fn onResolve(globalThis: *JSC.JSGlobalObject, callframe: *JSC.CallFrame) callconv(.C) JSValue {
         const arguments = callframe.arguments(2);
         var task: *TestRunnerTask = arguments.ptr[1].asPromisePtr(TestRunnerTask);
-        task.handleResult(.{ .pass = active_test_expectation_counter.actual }, .promise);
+        task.handleResult(.{ .pass = expect.active_test_expectation_counter.actual }, .promise);
         globalThis.bunVM().autoGarbageCollect();
         return JSValue.jsUndefined();
     }
@@ -4365,13 +611,13 @@ pub const TestScope = struct {
             if (args.len > 0) {
                 const err = args.ptr[0];
                 if (err.isEmptyOrUndefinedOrNull()) {
-                    task.handleResult(.{ .pass = active_test_expectation_counter.actual }, .callback);
+                    task.handleResult(.{ .pass = expect.active_test_expectation_counter.actual }, .callback);
                 } else {
                     globalThis.bunVM().runErrorHandlerWithDedupe(err, null);
-                    task.handleResult(.{ .fail = active_test_expectation_counter.actual }, .callback);
+                    task.handleResult(.{ .fail = expect.active_test_expectation_counter.actual }, .callback);
                 }
             } else {
-                task.handleResult(.{ .pass = active_test_expectation_counter.actual }, .callback);
+                task.handleResult(.{ .pass = expect.active_test_expectation_counter.actual }, .callback);
             }
         }
 
@@ -4432,7 +678,7 @@ pub const TestScope = struct {
                 return .{ .todo = {} };
             }
 
-            return .{ .fail = active_test_expectation_counter.actual };
+            return .{ .fail = expect.active_test_expectation_counter.actual };
         }
 
         if (initial_value.asAnyPromise()) |promise| {
@@ -4459,7 +705,7 @@ pub const TestScope = struct {
                         return .{ .todo = {} };
                     }
 
-                    return .{ .fail = active_test_expectation_counter.actual };
+                    return .{ .fail = expect.active_test_expectation_counter.actual };
                 },
                 .Pending => {
                     task.promise_state = .pending;
@@ -4481,15 +727,15 @@ pub const TestScope = struct {
             return .{ .pending = {} };
         }
 
-        if (active_test_expectation_counter.expected > 0 and active_test_expectation_counter.expected < active_test_expectation_counter.actual) {
+        if (expect.active_test_expectation_counter.expected > 0 and expect.active_test_expectation_counter.expected < expect.active_test_expectation_counter.actual) {
             Output.prettyErrorln("Test fail: {d} / {d} expectations\n (make this better!)", .{
-                active_test_expectation_counter.actual,
-                active_test_expectation_counter.expected,
+                expect.active_test_expectation_counter.actual,
+                expect.active_test_expectation_counter.expected,
             });
-            return .{ .fail = active_test_expectation_counter.actual };
+            return .{ .fail = expect.active_test_expectation_counter.actual };
         }
 
-        return .{ .pass = active_test_expectation_counter.actual };
+        return .{ .pass = expect.active_test_expectation_counter.actual };
     }
 
     pub const name = "TestScope";
@@ -4534,17 +780,23 @@ pub const DescribeScope = struct {
     }
 
     pub fn push(new: *DescribeScope) void {
-        if (comptime is_bindgen) return undefined;
-        if (new == DescribeScope.active) return;
-
-        new.parent = DescribeScope.active;
+        if (comptime is_bindgen) return;
+        if (new.parent) |scope| {
+            if (comptime Environment.allow_assert) {
+                std.debug.assert(DescribeScope.active != new);
+                std.debug.assert(scope == DescribeScope.active);
+            }
+        } else if (DescribeScope.active) |scope| {
+            // calling Bun.jest() within (already active) module
+            if (scope.parent != null) return;
+        }
         DescribeScope.active = new;
     }
 
     pub fn pop(this: *DescribeScope) void {
-        if (comptime is_bindgen) return undefined;
-        if (DescribeScope.active == this)
-            DescribeScope.active = this.parent orelse DescribeScope.active;
+        if (comptime is_bindgen) return;
+        if (comptime Environment.allow_assert) std.debug.assert(DescribeScope.active == this);
+        DescribeScope.active = this.parent;
     }
 
     pub const LifecycleHook = enum {
@@ -4554,8 +806,7 @@ pub const DescribeScope = struct {
         afterAll,
     };
 
-    pub threadlocal var active: *DescribeScope = undefined;
-    pub threadlocal var module: *DescribeScope = undefined;
+    pub threadlocal var active: ?*DescribeScope = null;
 
     const CallbackFn = *const fn (
         *JSC.JSGlobalObject,
@@ -4564,21 +815,24 @@ pub const DescribeScope = struct {
 
     fn createCallback(comptime hook: LifecycleHook) CallbackFn {
         return struct {
-            const this_hook = hook;
             pub fn run(
                 globalThis: *JSC.JSGlobalObject,
                 callframe: *JSC.CallFrame,
             ) callconv(.C) JSC.JSValue {
-                const arguments_ = callframe.arguments(2);
-                const arguments = arguments_.ptr[0..arguments_.len];
-                if (arguments.len == 0 or !arguments[0].isObject() or !arguments[0].isCallable(globalThis.vm())) {
-                    globalThis.throwInvalidArgumentType(@tagName(this_hook), "callback", "function");
+                const arguments = callframe.arguments(2);
+                if (arguments.len < 1) {
+                    globalThis.throwNotEnoughArguments("callback", 1, arguments.len);
+                    return .zero;
+                }
+
+                const cb = arguments.ptr[0];
+                if (!cb.isObject() or !cb.isCallable(globalThis.vm())) {
+                    globalThis.throwInvalidArgumentType(@tagName(hook), "callback", "function");
                     return .zero;
                 }
 
-                arguments[0].protect();
-                const name = comptime @as(string, @tagName(this_hook));
-                @field(DescribeScope.active, name).append(getAllocator(globalThis), arguments[0]) catch unreachable;
+                cb.protect();
+                @field(DescribeScope.active.?, @tagName(hook)).append(getAllocator(globalThis), cb) catch unreachable;
                 return JSC.JSValue.jsBoolean(true);
             }
         }.run;
@@ -4612,11 +866,24 @@ pub const DescribeScope = struct {
     pub const beforeAll = createCallback(.beforeAll);
     pub const beforeEach = createCallback(.beforeEach);
 
-    pub fn execCallback(this: *DescribeScope, globalObject: *JSC.JSGlobalObject, comptime hook: LifecycleHook) JSValue {
-        const name = comptime @as(string, @tagName(hook));
-        var hooks: []JSC.JSValue = @field(this, name).items;
-        for (hooks, 0..) |cb, i| {
-            if (cb.isEmpty()) continue;
+    pub fn execCallback(this: *DescribeScope, globalObject: *JSC.JSGlobalObject, comptime hook: LifecycleHook) ?JSValue {
+        var hooks = &@field(this, @tagName(hook));
+        defer {
+            if (comptime hook == .beforeAll or hook == .afterAll) {
+                hooks.clearAndFree(getAllocator(globalObject));
+            }
+        }
+
+        for (hooks.items) |cb| {
+            if (comptime Environment.allow_assert) {
+                std.debug.assert(cb.isObject());
+                std.debug.assert(cb.isCallable(globalObject.vm()));
+            }
+            defer {
+                if (comptime hook == .beforeAll or hook == .afterAll) {
+                    cb.unprotect();
+                }
+            }
 
             const pending_test = Jest.runner.?.pending_test;
             // forbid `expect()` within hooks
@@ -4626,20 +893,23 @@ pub const DescribeScope = struct {
             Jest.runner.?.did_pending_test_fail = false;
 
             const vm = VirtualMachine.get();
-            var result: JSC.JSValue = if (cb.getLength(globalObject) > 0) brk: {
-                this.done = false;
-                const done_func = JSC.NewFunctionWithData(
-                    globalObject,
-                    ZigString.static("done"),
-                    0,
-                    DescribeScope.onDone,
-                    false,
-                    this,
-                );
-                var result = cb.call(globalObject, &.{done_func});
-                vm.waitFor(&this.done);
-                break :brk result;
-            } else cb.call(globalObject, &.{});
+            var result: JSC.JSValue = switch (cb.getLength(globalObject)) {
+                0 => cb.call(globalObject, &.{}),
+                else => brk: {
+                    this.done = false;
+                    const done_func = JSC.NewFunctionWithData(
+                        globalObject,
+                        ZigString.static("done"),
+                        0,
+                        DescribeScope.onDone,
+                        false,
+                        this,
+                    );
+                    var result = cb.call(globalObject, &.{done_func});
+                    vm.waitFor(&this.done);
+                    break :brk result;
+                },
+            };
             if (result.asAnyPromise()) |promise| {
                 if (promise.status(globalObject.vm()) == .Pending) {
                     result.protect();
@@ -4653,19 +923,30 @@ pub const DescribeScope = struct {
             Jest.runner.?.pending_test = pending_test;
             Jest.runner.?.did_pending_test_fail = orig_did_pending_test_fail;
             if (result.isAnyError()) return result;
-
-            if (comptime hook == .beforeAll or hook == .afterAll) {
-                hooks[i] = JSC.JSValue.zero;
-            }
         }
 
-        return JSValue.zero;
+        return null;
     }
 
     pub fn runGlobalCallbacks(globalThis: *JSC.JSGlobalObject, comptime hook: LifecycleHook) ?JSValue {
         // global callbacks
-        for (@field(Jest.runner.?.global_callbacks, @tagName(hook)).items) |cb| {
-            if (cb.isEmpty()) continue;
+        var hooks = &@field(Jest.runner.?.global_callbacks, @tagName(hook));
+        defer {
+            if (comptime hook == .beforeAll or hook == .afterAll) {
+                hooks.clearAndFree(getAllocator(globalThis));
+            }
+        }
+
+        for (hooks.items) |cb| {
+            if (comptime Environment.allow_assert) {
+                std.debug.assert(cb.isObject());
+                std.debug.assert(cb.isCallable(globalThis.vm()));
+            }
+            defer {
+                if (comptime hook == .beforeAll or hook == .afterAll) {
+                    cb.unprotect();
+                }
+            }
 
             const pending_test = Jest.runner.?.pending_test;
             // forbid `expect()` within hooks
@@ -4692,28 +973,40 @@ pub const DescribeScope = struct {
             if (result.isAnyError()) return result;
         }
 
-        if (comptime hook == .beforeAll or hook == .afterAll) {
-            @field(Jest.runner.?.global_callbacks, @tagName(hook)).items.len = 0;
-        }
-
         return null;
     }
 
-    pub fn runCallback(this: *DescribeScope, globalObject: *JSC.JSGlobalObject, comptime hook: LifecycleHook) JSValue {
+    fn runBeforeCallbacks(this: *DescribeScope, globalObject: *JSC.JSGlobalObject, comptime hook: LifecycleHook) ?JSValue {
+        if (this.parent) |scope| {
+            if (scope.runBeforeCallbacks(globalObject, hook)) |err| {
+                return err;
+            }
+        }
+        return this.execCallback(globalObject, hook);
+    }
+
+    pub fn runCallback(this: *DescribeScope, globalObject: *JSC.JSGlobalObject, comptime hook: LifecycleHook) ?JSValue {
+        if (comptime hook == .afterAll or hook == .afterEach) {
+            var parent: ?*DescribeScope = this;
+            while (parent) |scope| {
+                if (scope.execCallback(globalObject, hook)) |err| {
+                    return err;
+                }
+                parent = scope.parent;
+            }
+        }
+
         if (runGlobalCallbacks(globalObject, hook)) |err| {
             return err;
         }
 
-        var parent = this.parent;
-        while (parent) |scope| {
-            const ret = scope.execCallback(globalObject, hook);
-            if (!ret.isEmpty()) {
-                return ret;
+        if (comptime hook == .beforeAll or hook == .beforeEach) {
+            if (this.runBeforeCallbacks(globalObject, hook)) |err| {
+                return err;
             }
-            parent = scope.parent;
         }
 
-        return this.execCallback(globalObject, hook);
+        return null;
     }
 
     pub fn call(globalThis: *JSGlobalObject, callframe: *CallFrame) callconv(.C) JSValue {
@@ -4744,11 +1037,8 @@ pub const DescribeScope = struct {
         if (comptime is_bindgen) return undefined;
         callback.protect();
         defer callback.unprotect();
-        var original_active = active;
-        defer active = original_active;
-        if (this != module)
-            this.parent = this.parent orelse active;
-        active = this;
+        this.push();
+        defer this.pop();
 
         if (callback == .zero) {
             this.runTests(globalObject);
@@ -4802,8 +1092,7 @@ pub const DescribeScope = struct {
         var i: TestRunner.Test.ID = 0;
 
         if (!this.isAllSkipped()) {
-            const beforeAllCallback = this.runCallback(globalObject, .beforeAll);
-            if (!beforeAllCallback.isEmpty()) {
+            if (this.runCallback(globalObject, .beforeAll)) |_| {
                 while (i < end) {
                     Jest.runner.?.reportFailure(i + this.test_id_start, source.path.text, tests[i].label, 0, 0, this);
                     i += 1;
@@ -4834,9 +1123,8 @@ pub const DescribeScope = struct {
         this.pending_tests.unset(test_id);
 
         if (!skipped) {
-            const afterEach_result = this.runCallback(globalThis, .afterEach);
-            if (!afterEach_result.isEmpty()) {
-                globalThis.bunVM().runErrorHandler(afterEach_result, null);
+            if (this.runCallback(globalThis, .afterEach)) |err| {
+                globalThis.bunVM().runErrorHandler(err, null);
             }
         }
 
@@ -4847,9 +1135,8 @@ pub const DescribeScope = struct {
         if (!this.isAllSkipped()) {
             // Run the afterAll callbacks, in reverse order
             // unless there were no tests for this scope
-            const afterAll_result = this.execCallback(globalThis, .afterAll);
-            if (!afterAll_result.isEmpty()) {
-                globalThis.bunVM().runErrorHandler(afterAll_result, null);
+            if (this.execCallback(globalThis, .afterAll)) |err| {
+                globalThis.bunVM().runErrorHandler(err, null);
             }
         }
 
@@ -4876,8 +1163,6 @@ pub const DescribeScope = struct {
 
 };
 
-var active_test_expectation_counter: TestScope.Counter = undefined;
-
 pub const TestRunnerTask = struct {
     test_id: TestRunner.Test.ID,
     describe: *DescribeScope,
@@ -4920,7 +1205,7 @@ pub const TestRunnerTask = struct {
         if (jsc_vm.onUnhandledRejectionCtx) |ctx| {
             var this = bun.cast(*TestRunnerTask, ctx);
             jsc_vm.onUnhandledRejectionCtx = null;
-            this.handleResult(.{ .fail = active_test_expectation_counter.actual }, .unhandledRejection);
+            this.handleResult(.{ .fail = expect.active_test_expectation_counter.actual }, .unhandledRejection);
         }
     }
 
@@ -4931,8 +1216,7 @@ pub const TestRunnerTask = struct {
 
         // reset the global state for each test
         // prior to the run
-        DescribeScope.active = describe;
-        active_test_expectation_counter = .{};
+        expect.active_test_expectation_counter = .{};
         jsc_vm.last_reported_error_for_dedupe = .zero;
 
         const test_id = this.test_id;
@@ -4963,11 +1247,9 @@ pub const TestRunnerTask = struct {
             this.needs_before_each = false;
             const label = test_.label;
 
-            const beforeEach = this.describe.runCallback(globalThis, .beforeEach);
-
-            if (!beforeEach.isEmpty()) {
+            if (this.describe.runCallback(globalThis, .beforeEach)) |err| {
                 Jest.runner.?.reportFailure(test_id, this.source_file_path, label, 0, 0, this.describe);
-                jsc_vm.runErrorHandler(beforeEach, null);
+                jsc_vm.runErrorHandler(err, null);
                 return false;
             }
         }
@@ -4995,11 +1277,11 @@ pub const TestRunnerTask = struct {
     }
 
     pub fn timeout(this: *TestRunnerTask) void {
-        std.debug.assert(!this.reported);
+        if (comptime Environment.allow_assert) std.debug.assert(!this.reported);
 
         this.ref.unref(this.globalThis.bunVM());
         this.globalThis.throwTerminationException();
-        this.handleResult(.{ .fail = active_test_expectation_counter.actual }, .timeout);
+        this.handleResult(.{ .fail = expect.active_test_expectation_counter.actual }, .timeout);
     }
 
     pub fn handleResult(this: *TestRunnerTask, result: Result, comptime from: @Type(.EnumLiteral)) void {
@@ -5008,7 +1290,7 @@ pub const TestRunnerTask = struct {
 
         switch (comptime from) {
             .promise => {
-                std.debug.assert(this.promise_state == .pending);
+                if (comptime Environment.allow_assert) std.debug.assert(this.promise_state == .pending);
                 this.promise_state = .fulfilled;
 
                 if (this.done_callback_state == .pending and result == .pass) {
@@ -5016,7 +1298,7 @@ pub const TestRunnerTask = struct {
                 }
             },
             .callback => {
-                std.debug.assert(this.done_callback_state == .pending);
+                if (comptime Environment.allow_assert) std.debug.assert(this.done_callback_state == .pending);
                 this.done_callback_state = .fulfilled;
 
                 if (this.promise_state == .pending and result == .pass) {
@@ -5024,7 +1306,7 @@ pub const TestRunnerTask = struct {
                 }
             },
             .sync => {
-                std.debug.assert(this.sync_state == .pending);
+                if (comptime Environment.allow_assert) std.debug.assert(this.sync_state == .pending);
                 this.sync_state = .fulfilled;
             },
             .timeout, .unhandledRejection => {},
@@ -5202,7 +1484,7 @@ inline fn createScope(
         return .zero;
     }
 
-    const parent = DescribeScope.active;
+    const parent = DescribeScope.active.?;
     const allocator = getAllocator(globalThis);
     const label = if (description == .zero)
         ""
@@ -5298,7 +1580,7 @@ pub fn printGithubAnnotation(exception: *JSC.ZigException) void {
 
     if (top_frame) |frame| {
         if (!frame.position.isInvalid()) {
-            const source_url = frame.source_url.toSlice(allocator);
+            const source_url = frame.source_url.toUTF8(allocator);
             defer source_url.deinit();
             const file = bun.path.relative(dir, source_url.slice());
             Output.printError("\n::error file={s},line={d},col={d},title=", .{
@@ -5314,14 +1596,14 @@ pub fn printGithubAnnotation(exception: *JSC.ZigException) void {
         Output.printError("\n::error title=", .{});
     }
 
-    if (name.len == 0 or name.eqlComptime("Error")) {
+    if (name.isEmpty() or name.eqlComptime("Error")) {
         Output.printError("error", .{});
     } else {
         Output.printError("{s}", .{name.githubAction()});
     }
 
-    if (message.len > 0) {
-        const message_slice = message.toSlice(allocator);
+    if (!message.isEmpty()) {
+        const message_slice = message.toUTF8(allocator);
         defer message_slice.deinit();
         const msg = message_slice.slice();
 
@@ -5329,7 +1611,7 @@ pub fn printGithubAnnotation(exception: *JSC.ZigException) void {
         while (strings.indexOfNewlineOrNonASCIIOrANSI(msg, cursor)) |i| {
             cursor = i + 1;
             if (msg[i] == '\n') {
-                const first_line = ZigString.init(msg[0..i]);
+                const first_line = bun.String.fromUTF8(msg[0..i]);
                 Output.printError(": {s}::", .{first_line.githubAction()});
                 break;
             }
@@ -5360,10 +1642,10 @@ pub fn printGithubAnnotation(exception: *JSC.ZigException) void {
         var i: i16 = 0;
         while (i < frames.len) : (i += 1) {
             const frame = frames[@intCast(usize, i)];
-            const source_url = frame.source_url.toSlice(allocator);
+            const source_url = frame.source_url.toUTF8(allocator);
             defer source_url.deinit();
             const file = bun.path.relative(dir, source_url.slice());
-            const func = frame.function_name.toSlice(allocator);
+            const func = frame.function_name.toUTF8(allocator);
 
             if (file.len == 0 and func.len == 0) continue;
 
@@ -5404,11 +1686,3 @@ pub fn printGithubAnnotation(exception: *JSC.ZigException) void {
     Output.printError("\n", .{});
     Output.flush();
 }
-
-/// JSValue.zero is used to indicate it was not a JSMockFunction
-/// If there were no calls, it returns an empty JSArray*
-extern fn JSMockFunction__getCalls(JSValue) JSValue;
-
-/// JSValue.zero is used to indicate it was not a JSMockFunction
-/// If there were no calls, it returns an empty JSArray*
-extern fn JSMockFunction__getReturns(JSValue) JSValue;
diff --git a/src/bun.js/test/pretty_format.zig b/src/bun.js/test/pretty_format.zig
index 15ab88799..a6c6aa631 100644
--- a/src/bun.js/test/pretty_format.zig
+++ b/src/bun.js/test/pretty_format.zig
@@ -15,6 +15,7 @@ const JSPrinter = bun.js_printer;
 const JSPrivateDataPtr = JSC.JSPrivateDataPtr;
 const JS = @import("../javascript.zig");
 const JSPromise = JSC.JSPromise;
+const expect = @import("./expect.zig");
 
 pub const EventType = enum(u8) {
     Event,
@@ -362,7 +363,7 @@ pub const JestPrettyFormat = struct {
             };
 
             pub fn get(value: JSValue, globalThis: *JSGlobalObject) Result {
-                switch (@enumToInt(value)) {
+                switch (@intFromEnum(value)) {
                     0, 0xa => return Result{
                         .tag = .Undefined,
                     },
@@ -422,23 +423,20 @@ pub const JestPrettyFormat = struct {
 
                 // If we check an Object has a method table and it does not
                 // it will crash
-                const callable = js_type != .Object and value.isCallable(globalThis.vm());
+                if (js_type != .Object and value.isCallable(globalThis.vm())) {
+                    if (value.isClass(globalThis)) {
+                        return .{
+                            .tag = .Class,
+                            .cell = js_type,
+                        };
+                    }
 
-                if (value.isClass(globalThis) and !callable) {
                     return .{
-                        .tag = .Object,
-                        .cell = js_type,
-                    };
-                }
-
-                if (callable and js_type == .JSFunction) {
-                    return .{
-                        .tag = .Function,
-                        .cell = js_type,
-                    };
-                } else if (callable and js_type == .InternalFunction) {
-                    return .{
-                        .tag = .Object,
+                        // TODO: we print InternalFunction as Object because we have a lot of
+                        // callable namespaces, and printing their contents is better than [Function: namespace].
+                        // Ideally, we would print [Function: namespace] { ... } for all functions, internal and JS.
+                        // What we'll do later is get rid of .Function and .Class and handle the prefix in the .Object formatter.
+                        .tag = if (js_type == .InternalFunction) .Object else .Function,
                         .cell = js_type,
                     };
                 }
@@ -749,7 +747,7 @@ pub const JestPrettyFormat = struct {
                 parent: JSValue,
                 const enable_ansi_colors = enable_ansi_colors_;
                 pub fn handleFirstProperty(this: *@This(), globalThis: *JSC.JSGlobalObject, value: JSValue) void {
-                    if (!value.jsType().isFunction() and !value.isClass(globalThis)) {
+                    if (!value.jsType().isFunction()) {
                         var writer = WrappedWriter(Writer){
                             .ctx = this.writer,
                             .failed = false,
@@ -921,7 +919,7 @@ pub const JestPrettyFormat = struct {
                     this.map = this.map_node.?.data;
                 }
 
-                var entry = this.map.getOrPut(@enumToInt(value)) catch unreachable;
+                var entry = this.map.getOrPut(@intFromEnum(value)) catch unreachable;
                 if (entry.found_existing) {
                     writer.writeAll(comptime Output.prettyFmt("<r><cyan>[Circular]<r>", enable_ansi_colors));
                     return;
@@ -930,7 +928,7 @@ pub const JestPrettyFormat = struct {
 
             defer {
                 if (comptime Format.canHaveCircularReferences()) {
-                    _ = this.map.remove(@enumToInt(value));
+                    _ = this.map.remove(@intFromEnum(value));
                 }
             }
 
@@ -1049,7 +1047,7 @@ pub const JestPrettyFormat = struct {
                             i = -i;
                         }
                         const digits = if (i != 0)
-                            bun.fmt.fastDigitCount(@intCast(usize, i)) + @as(usize, @boolToInt(is_negative))
+                            bun.fmt.fastDigitCount(@intCast(usize, i)) + @as(usize, @intFromBool(is_negative))
                         else
                             1;
                         this.addForNewLine(digits);
@@ -1125,13 +1123,20 @@ pub const JestPrettyFormat = struct {
                     this.addForNewLine(printable.len);
 
                     if (printable.len == 0) {
-                        writer.print(comptime Output.prettyFmt("[class]", enable_ansi_colors), .{});
+                        writer.print(comptime Output.prettyFmt("<cyan>[class]<r>", enable_ansi_colors), .{});
                     } else {
-                        writer.print(comptime Output.prettyFmt("[class <cyan>{}<r>]", enable_ansi_colors), .{printable});
+                        writer.print(comptime Output.prettyFmt("<cyan>[class {}]<r>", enable_ansi_colors), .{printable});
                     }
                 },
                 .Function => {
-                    writer.writeAll("[Function]");
+                    var printable = ZigString.init(&name_buf);
+                    value.getNameProperty(this.globalThis, &printable);
+
+                    if (printable.len == 0) {
+                        writer.print(comptime Output.prettyFmt("<cyan>[Function]<r>", enable_ansi_colors), .{});
+                    } else {
+                        writer.print(comptime Output.prettyFmt("<cyan>[Function: {}]<r>", enable_ansi_colors), .{printable});
+                    }
                 },
                 .Array => {
                     const len = @truncate(u32, value.getLength(this.globalThis));
@@ -1264,12 +1269,12 @@ pub const JestPrettyFormat = struct {
                     } else if (value.as(JSC.ResolveMessage)) |resolve_log| {
                         resolve_log.msg.writeFormat(writer_, enable_ansi_colors) catch {};
                         return;
-                    } else if (value.as(JSC.Jest.ExpectAnything) != null) {
+                    } else if (value.as(expect.ExpectAnything) != null) {
                         this.addForNewLine("Anything".len);
                         writer.writeAll("Anything");
                         return;
-                    } else if (value.as(JSC.Jest.ExpectAny) != null) {
-                        const constructor_value = JSC.Jest.ExpectAny.constructorValueGetCached(value) orelse return;
+                    } else if (value.as(expect.ExpectAny) != null) {
+                        const constructor_value = expect.ExpectAny.constructorValueGetCached(value) orelse return;
 
                         this.addForNewLine("Any<".len);
                         writer.writeAll("Any<");
@@ -1281,16 +1286,16 @@ pub const JestPrettyFormat = struct {
                         writer.writeAll(">");
 
                         return;
-                    } else if (value.as(JSC.Jest.ExpectStringContaining) != null) {
-                        const substring_value = JSC.Jest.ExpectStringContaining.stringValueGetCached(value) orelse return;
+                    } else if (value.as(expect.ExpectStringContaining) != null) {
+                        const substring_value = expect.ExpectStringContaining.stringValueGetCached(value) orelse return;
 
                         this.addForNewLine("StringContaining ".len);
                         writer.writeAll("StringContaining ");
                         this.printAs(.String, Writer, writer_, substring_value, .String, enable_ansi_colors);
 
                         return;
-                    } else if (value.as(JSC.Jest.ExpectStringMatching) != null) {
-                        const test_value = JSC.Jest.ExpectStringMatching.testValueGetCached(value) orelse return;
+                    } else if (value.as(expect.ExpectStringMatching) != null) {
+                        const test_value = expect.ExpectStringMatching.testValueGetCached(value) orelse return;
 
                         this.addForNewLine("StringMatching ".len);
                         writer.writeAll("StringMatching ");
@@ -1553,7 +1558,7 @@ pub const JestPrettyFormat = struct {
                             {
                                 this.indent += 1;
                                 defer this.indent -|= 1;
-                                const count_without_children = props_iter.len - @as(usize, @boolToInt(children_prop != null));
+                                const count_without_children = props_iter.len - @as(usize, @intFromBool(children_prop != null));
 
                                 while (props_iter.next()) |prop| {
                                     if (prop.eqlComptime("children"))
diff --git a/src/bun.js/test/snapshot.zig b/src/bun.js/test/snapshot.zig
new file mode 100644
index 000000000..12c7b3c36
--- /dev/null
+++ b/src/bun.js/test/snapshot.zig
@@ -0,0 +1,284 @@
+const std = @import("std");
+const bun = @import("root").bun;
+const default_allocator = bun.default_allocator;
+const string = bun.string;
+const MutableString = bun.MutableString;
+const strings = bun.strings;
+const logger = bun.logger;
+const jest = @import("./jest.zig");
+const Jest = jest.Jest;
+const TestRunner = jest.TestRunner;
+const js_parser = bun.js_parser;
+const js_ast = bun.JSAst;
+const JSC = bun.JSC;
+const JSValue = JSC.JSValue;
+const VirtualMachine = JSC.VirtualMachine;
+const Expect = @import("./expect.zig").Expect;
+
+pub const Snapshots = struct {
+    const file_header = "// Bun Snapshot v1, https://goo.gl/fbAQLP\n";
+    pub const ValuesHashMap = std.HashMap(usize, string, bun.IdentityContext(usize), std.hash_map.default_max_load_percentage);
+
+    allocator: std.mem.Allocator,
+    update_snapshots: bool,
+    total: usize = 0,
+    added: usize = 0,
+    passed: usize = 0,
+    failed: usize = 0,
+
+    file_buf: *std.ArrayList(u8),
+    values: *ValuesHashMap,
+    counts: *bun.StringHashMap(usize),
+    _current_file: ?File = null,
+    snapshot_dir_path: ?string = null,
+
+    const File = struct {
+        id: TestRunner.File.ID,
+        file: std.fs.File,
+    };
+
+    pub fn getOrPut(this: *Snapshots, expect: *Expect, value: JSValue, hint: string, globalObject: *JSC.JSGlobalObject) !?string {
+        switch (try this.getSnapshotFile(expect.scope.file_id)) {
+            .result => {},
+            .err => |err| {
+                return switch (err.syscall) {
+                    .mkdir => error.FailedToMakeSnapshotDirectory,
+                    .open => error.FailedToOpenSnapshotFile,
+                    else => error.SnapshotFailed,
+                };
+            },
+        }
+
+        const snapshot_name = try expect.getSnapshotName(this.allocator, hint);
+        this.total += 1;
+
+        var count_entry = try this.counts.getOrPut(snapshot_name);
+        const counter = brk: {
+            if (count_entry.found_existing) {
+                this.allocator.free(snapshot_name);
+                count_entry.value_ptr.* += 1;
+                break :brk count_entry.value_ptr.*;
+            }
+            count_entry.value_ptr.* = 1;
+            break :brk count_entry.value_ptr.*;
+        };
+
+        const name = count_entry.key_ptr.*;
+
+        var counter_string_buf = [_]u8{0} ** 32;
+        var counter_string = try std.fmt.bufPrint(&counter_string_buf, "{d}", .{counter});
+
+        var name_with_counter = try this.allocator.alloc(u8, name.len + 1 + counter_string.len);
+        defer this.allocator.free(name_with_counter);
+        bun.copy(u8, name_with_counter[0..name.len], name);
+        name_with_counter[name.len] = ' ';
+        bun.copy(u8, name_with_counter[name.len + 1 ..], counter_string);
+
+        const name_hash = bun.hash(name_with_counter);
+        if (this.values.get(name_hash)) |expected| {
+            return expected;
+        }
+
+        // doesn't exist. append to file bytes and add to hashmap.
+        var pretty_value = try MutableString.init(this.allocator, 0);
+        try value.jestSnapshotPrettyFormat(&pretty_value, globalObject);
+
+        const serialized_length = "\nexports[`".len + name_with_counter.len + "`] = `".len + pretty_value.list.items.len + "`;\n".len;
+        try this.file_buf.ensureUnusedCapacity(serialized_length);
+        this.file_buf.appendSliceAssumeCapacity("\nexports[`");
+        this.file_buf.appendSliceAssumeCapacity(name_with_counter);
+        this.file_buf.appendSliceAssumeCapacity("`] = `");
+        this.file_buf.appendSliceAssumeCapacity(pretty_value.list.items);
+        this.file_buf.appendSliceAssumeCapacity("`;\n");
+
+        this.added += 1;
+        try this.values.put(name_hash, pretty_value.toOwnedSlice());
+        return null;
+    }
+
+    pub fn parseFile(this: *Snapshots) !void {
+        if (this.file_buf.items.len == 0) return;
+
+        const vm = VirtualMachine.get();
+        var opts = js_parser.Parser.Options.init(vm.bundler.options.jsx, .js);
+        var temp_log = logger.Log.init(this.allocator);
+
+        const test_file = Jest.runner.?.files.get(this._current_file.?.id);
+        const test_filename = test_file.source.path.name.filename;
+        const dir_path = test_file.source.path.name.dirWithTrailingSlash();
+
+        var snapshot_file_path_buf: [bun.MAX_PATH_BYTES]u8 = undefined;
+        var remain: []u8 = snapshot_file_path_buf[0..bun.MAX_PATH_BYTES];
+        bun.copy(u8, remain, dir_path);
+        remain = remain[dir_path.len..];
+        bun.copy(u8, remain, "__snapshots__/");
+        remain = remain["__snapshots__/".len..];
+        bun.copy(u8, remain, test_filename);
+        remain = remain[test_filename.len..];
+        bun.copy(u8, remain, ".snap");
+        remain = remain[".snap".len..];
+        remain[0] = 0;
+        const snapshot_file_path = snapshot_file_path_buf[0 .. snapshot_file_path_buf.len - remain.len :0];
+
+        const source = logger.Source.initPathString(snapshot_file_path, this.file_buf.items);
+
+        var parser = try js_parser.Parser.init(
+            opts,
+            &temp_log,
+            &source,
+            vm.bundler.options.define,
+            this.allocator,
+        );
+
+        var parse_result = try parser.parse();
+        var ast = if (parse_result == .ast) parse_result.ast else return error.ParseError;
+        defer ast.deinit();
+
+        if (ast.exports_ref.isNull()) return;
+        const exports_ref = ast.exports_ref;
+
+        // TODO: when common js transform changes, keep this updated or add flag to support this version
+
+        const export_default = brk: {
+            for (ast.parts.slice()) |part| {
+                for (part.stmts) |stmt| {
+                    if (stmt.data == .s_export_default and stmt.data.s_export_default.value == .expr) {
+                        break :brk stmt.data.s_export_default.value.expr;
+                    }
+                }
+            }
+
+            return;
+        };
+
+        if (export_default.data == .e_call) {
+            const function_call = export_default.data.e_call;
+            if (function_call.args.len == 2 and function_call.args.ptr[0].data == .e_function) {
+                const arg_function_stmts = function_call.args.ptr[0].data.e_function.func.body.stmts;
+                for (arg_function_stmts) |stmt| {
+                    switch (stmt.data) {
+                        .s_expr => |expr| {
+                            if (expr.value.data == .e_binary and expr.value.data.e_binary.op == .bin_assign) {
+                                const left = expr.value.data.e_binary.left;
+                                if (left.data == .e_index and left.data.e_index.index.data == .e_string and left.data.e_index.target.data == .e_identifier) {
+                                    const target: js_ast.E.Identifier = left.data.e_index.target.data.e_identifier;
+                                    var index: *js_ast.E.String = left.data.e_index.index.data.e_string;
+                                    if (target.ref.eql(exports_ref) and expr.value.data.e_binary.right.data == .e_string) {
+                                        const key = index.slice(this.allocator);
+                                        var value_string = expr.value.data.e_binary.right.data.e_string;
+                                        const value = value_string.slice(this.allocator);
+                                        defer {
+                                            if (!index.isUTF8()) this.allocator.free(key);
+                                            if (!value_string.isUTF8()) this.allocator.free(value);
+                                        }
+                                        const value_clone = try this.allocator.alloc(u8, value.len);
+                                        bun.copy(u8, value_clone, value);
+                                        const name_hash = bun.hash(key);
+                                        try this.values.put(name_hash, value_clone);
+                                    }
+                                }
+                            }
+                        },
+                        else => {},
+                    }
+                }
+            }
+        }
+    }
+
+    pub fn writeSnapshotFile(this: *Snapshots) !void {
+        if (this._current_file) |_file| {
+            var file = _file;
+            file.file.writeAll(this.file_buf.items) catch {
+                return error.FailedToWriteSnapshotFile;
+            };
+            file.file.close();
+            this.file_buf.clearAndFree();
+
+            var value_itr = this.values.valueIterator();
+            while (value_itr.next()) |value| {
+                this.allocator.free(value.*);
+            }
+            this.values.clearAndFree();
+
+            var count_key_itr = this.counts.keyIterator();
+            while (count_key_itr.next()) |key| {
+                this.allocator.free(key.*);
+            }
+            this.counts.clearAndFree();
+        }
+    }
+
+    fn getSnapshotFile(this: *Snapshots, file_id: TestRunner.File.ID) !JSC.Maybe(void) {
+        if (this._current_file == null or this._current_file.?.id != file_id) {
+            try this.writeSnapshotFile();
+
+            const test_file = Jest.runner.?.files.get(file_id);
+            const test_filename = test_file.source.path.name.filename;
+            const dir_path = test_file.source.path.name.dirWithTrailingSlash();
+
+            var snapshot_file_path_buf: [bun.MAX_PATH_BYTES]u8 = undefined;
+            var remain: []u8 = snapshot_file_path_buf[0..bun.MAX_PATH_BYTES];
+            bun.copy(u8, remain, dir_path);
+            remain = remain[dir_path.len..];
+            bun.copy(u8, remain, "__snapshots__/");
+            remain = remain["__snapshots__/".len..];
+
+            if (this.snapshot_dir_path == null or !strings.eqlLong(dir_path, this.snapshot_dir_path.?, true)) {
+                remain[0] = 0;
+                const snapshot_dir_path = snapshot_file_path_buf[0 .. snapshot_file_path_buf.len - remain.len :0];
+                switch (JSC.Node.Syscall.mkdir(snapshot_dir_path, 0o777)) {
+                    .result => this.snapshot_dir_path = dir_path,
+                    .err => |err| {
+                        switch (err.getErrno()) {
+                            std.os.E.EXIST => this.snapshot_dir_path = dir_path,
+                            else => return JSC.Maybe(void){
+                                .err = err,
+                            },
+                        }
+                    },
+                }
+            }
+
+            bun.copy(u8, remain, test_filename);
+            remain = remain[test_filename.len..];
+            bun.copy(u8, remain, ".snap");
+            remain = remain[".snap".len..];
+            remain[0] = 0;
+            const snapshot_file_path = snapshot_file_path_buf[0 .. snapshot_file_path_buf.len - remain.len :0];
+
+            var flags: JSC.Node.Mode = std.os.O.CREAT | std.os.O.RDWR;
+            if (this.update_snapshots) flags |= std.os.O.TRUNC;
+            const fd = switch (JSC.Node.Syscall.open(snapshot_file_path, flags, 0o644)) {
+                .result => |_fd| _fd,
+                .err => |err| return JSC.Maybe(void){
+                    .err = err,
+                },
+            };
+
+            var file: File = .{
+                .id = file_id,
+                .file = .{ .handle = fd },
+            };
+
+            if (this.update_snapshots) {
+                try this.file_buf.appendSlice(file_header);
+            } else {
+                const length = try file.file.getEndPos();
+                if (length == 0) {
+                    try this.file_buf.appendSlice(file_header);
+                } else {
+                    const buf = try this.allocator.alloc(u8, length);
+                    _ = try file.file.preadAll(buf, 0);
+                    try this.file_buf.appendSlice(buf);
+                    this.allocator.free(buf);
+                }
+            }
+
+            this._current_file = file;
+            try this.parseFile();
+        }
+
+        return JSC.Maybe(void).success;
+    }
+};
diff --git a/src/bun.js/uuid.zig b/src/bun.js/uuid.zig
index e8bdff661..e38ed567f 100644
--- a/src/bun.js/uuid.zig
+++ b/src/bun.js/uuid.zig
@@ -18,6 +18,16 @@ pub fn init() UUID {
     uuid.bytes[6] = (uuid.bytes[6] & 0x0f) | 0x40;
     // Variant 1
     uuid.bytes[8] = (uuid.bytes[8] & 0x3f) | 0x80;
+
+    return uuid;
+}
+
+pub fn initWith(bytes: *const [16]u8) UUID {
+    var uuid = UUID{ .bytes = bytes.* };
+
+    uuid.bytes[6] = (uuid.bytes[6] & 0x0f) | 0x40;
+    uuid.bytes[8] = (uuid.bytes[8] & 0x3f) | 0x80;
+
     return uuid;
 }
 
diff --git a/src/bun.js/webcore.zig b/src/bun.js/webcore.zig
index e7357e4ca..8cb9ec80a 100644
--- a/src/bun.js/webcore.zig
+++ b/src/bun.js/webcore.zig
@@ -10,6 +10,7 @@ const std = @import("std");
 const bun = @import("root").bun;
 const string = bun.string;
 pub const AbortSignal = @import("./bindings/bindings.zig").AbortSignal;
+pub const JSValue = @import("./bindings/bindings.zig").JSValue;
 
 pub const Lifetime = enum {
     clone,
@@ -365,7 +366,6 @@ pub const Prompt = struct {
 };
 
 pub const Crypto = struct {
-    const UUID = @import("./uuid.zig");
     const BoringSSL = @import("root").bun.BoringSSL;
     pub const Class = JSC.NewClass(
         void,
@@ -374,6 +374,7 @@ pub const Crypto = struct {
             .getRandomValues = JSC.DOMCall("Crypto", @This(), "getRandomValues", JSC.JSValue, JSC.DOMEffect.top),
             .randomUUID = JSC.DOMCall("Crypto", @This(), "randomUUID", *JSC.JSString, JSC.DOMEffect.top),
             .timingSafeEqual = JSC.DOMCall("Crypto", @This(), "timingSafeEqual", JSC.JSValue, JSC.DOMEffect.top),
+            .randomInt = .{ .rfn = &JSC.wrapWithHasContainer(Crypto, "randomInt", false, false, false) },
             .scryptSync = .{ .rfn = &JSC.wrapWithHasContainer(Crypto, "scryptSync", false, false, false) },
         },
         .{},
@@ -663,7 +664,7 @@ pub const Crypto = struct {
     ) callconv(.C) JSC.JSValue {
         var slice = array.slice();
         randomData(globalThis, slice.ptr, slice.len);
-        return @intToEnum(JSC.JSValue, @bitCast(i64, @ptrToInt(array)));
+        return @enumFromInt(JSC.JSValue, @bitCast(i64, @intFromPtr(array)));
     }
 
     fn randomData(
@@ -691,21 +692,37 @@ pub const Crypto = struct {
         _: []const JSC.JSValue,
     ) JSC.JSValue {
         var out: [36]u8 = undefined;
-        const uuid: UUID = .{
-            .bytes = globalThis.bunVM().rareData().nextUUID(),
-        };
+        const uuid = globalThis.bunVM().rareData().nextUUID();
+
         uuid.print(&out);
         return JSC.ZigString.init(&out).toValueGC(globalThis);
     }
 
+    pub fn randomInt(globalThis: *JSC.JSGlobalObject, min_value: ?JSValue, max_value: ?JSValue) JSValue {
+        _ = globalThis;
+
+        var at_least: u52 = 0;
+        var at_most: u52 = std.math.maxInt(u52);
+
+        if (min_value) |min| {
+            if (max_value) |max| {
+                if (min.isNumber()) at_least = min.to(u52);
+                if (max.isNumber()) at_most = max.to(u52);
+            } else {
+                if (min.isNumber()) at_most = min.to(u52);
+            }
+        }
+
+        return JSValue.jsNumberFromUint64(std.crypto.random.intRangeAtMost(u52, at_least, at_most));
+    }
+
     pub fn randomUUIDWithoutTypeChecks(
         globalThis: *JSC.JSGlobalObject,
         _: *anyopaque,
     ) callconv(.C) JSC.JSValue {
         var out: [36]u8 = undefined;
-        const uuid: UUID = .{
-            .bytes = globalThis.bunVM().rareData().nextUUID(),
-        };
+        const uuid = globalThis.bunVM().rareData().nextUUID();
+
         uuid.print(&out);
         return JSC.ZigString.init(&out).toValueGC(globalThis);
     }
diff --git a/src/bun.js/webcore/blob.zig b/src/bun.js/webcore/blob.zig
index 9b3ddb8df..ef2520049 100644
--- a/src/bun.js/webcore/blob.zig
+++ b/src/bun.js/webcore/blob.zig
@@ -249,7 +249,7 @@ pub const Blob = struct {
 
         var hex_buf: [70]u8 = undefined;
         const boundary = brk: {
-            var random = globalThis.bunVM().rareData().nextUUID();
+            var random = globalThis.bunVM().rareData().nextUUID().bytes;
             var formatter = std.fmt.fmtSliceHexLower(&random);
             break :brk std.fmt.bufPrint(&hex_buf, "-WebkitFormBoundary{any}", .{formatter}) catch unreachable;
         };
@@ -319,6 +319,7 @@ pub const Blob = struct {
             },
         );
     }
+
     pub fn writeFormat(this: *const Blob, comptime Formatter: type, formatter: *Formatter, writer: anytype, comptime enable_ansi_colors: bool) !void {
         const Writer = @TypeOf(writer);
 
@@ -546,7 +547,7 @@ pub const Blob = struct {
             return JSPromise.resolvedPromiseValue(ctx.ptr(), cloned.toJS(ctx)).asObjectRef();
         } else if (destination_type == .bytes and source_type == .file) {
             var fake_call_frame: [8]JSC.JSValue = undefined;
-            @memset(@ptrCast([*]u8, &fake_call_frame), 0, @sizeOf(@TypeOf(fake_call_frame)));
+            @memset(@ptrCast([*]u8, &fake_call_frame)[0..@sizeOf(@TypeOf(fake_call_frame))], 0);
             const blob_value =
                 source_blob.getSlice(ctx, @ptrCast(*JSC.CallFrame, &fake_call_frame));
 
@@ -599,11 +600,14 @@ pub const Blob = struct {
         }
 
         var needs_async = false;
+
         if (data.isString()) {
+            defer if (!needs_async and path_or_blob == .path) path_or_blob.path.deinit();
+
             const len = data.getLength(ctx);
 
             if (len < 256 * 1024 or bun.isMissingIOUring()) {
-                const str = data.getZigString(ctx);
+                const str = data.toBunString(ctx);
 
                 const pathlike: JSC.Node.PathOrFileDescriptor = if (path_or_blob == .path)
                     path_or_blob.path
@@ -635,6 +639,8 @@ pub const Blob = struct {
                 }
             }
         } else if (data.asArrayBuffer(ctx)) |buffer_view| {
+            defer if (!needs_async and path_or_blob == .path) path_or_blob.path.deinit();
+
             if (buffer_view.byte_len < 256 * 1024 or bun.isMissingIOUring()) {
                 const pathlike: JSC.Node.PathOrFileDescriptor = if (path_or_blob == .path)
                     path_or_blob.path
@@ -784,7 +790,7 @@ pub const Blob = struct {
     fn writeStringToFileFast(
         globalThis: *JSC.JSGlobalObject,
         pathlike: JSC.Node.PathOrFileDescriptor,
-        str: ZigString,
+        str: bun.String,
         needs_async: *bool,
         comptime needs_open: bool,
     ) JSC.JSValue {
@@ -807,7 +813,7 @@ pub const Blob = struct {
             unreachable;
         };
 
-        var truncate = needs_open or str.len == 0;
+        var truncate = needs_open or str.isEmpty();
         var jsc_vm = globalThis.bunVM();
         var written: usize = 0;
 
@@ -822,62 +828,12 @@ pub const Blob = struct {
                 _ = JSC.Node.Syscall.close(fd);
             }
         }
-        if (str.len == 0) {} else if (str.is16Bit()) {
-            var decoded = str.toSlice(jsc_vm.allocator);
+        if (!str.isEmpty()) {
+            var decoded = str.toUTF8(jsc_vm.allocator);
             defer decoded.deinit();
 
             var remain = decoded.slice();
-            const end = remain.ptr + remain.len;
-
-            while (remain.ptr != end) {
-                const result = JSC.Node.Syscall.write(fd, remain);
-                switch (result) {
-                    .result => |res| {
-                        written += res;
-                        remain = remain[res..];
-                        if (res == 0) break;
-                    },
-                    .err => |err| {
-                        truncate = false;
-                        if (err.getErrno() == .AGAIN) {
-                            needs_async.* = true;
-                            return .zero;
-                        }
-                        return JSC.JSPromise.rejectedPromiseValue(globalThis, err.toJSC(globalThis));
-                    },
-                }
-            }
-        } else if (str.isUTF8() or strings.isAllASCII(str.slice())) {
-            var remain = str.slice();
-            const end = remain.ptr + remain.len;
-
-            while (remain.ptr != end) {
-                const result = JSC.Node.Syscall.write(fd, remain);
-                switch (result) {
-                    .result => |res| {
-                        written += res;
-                        remain = remain[res..];
-                        if (res == 0) break;
-                    },
-                    .err => |err| {
-                        truncate = false;
-                        if (err.getErrno() == .AGAIN) {
-                            needs_async.* = true;
-                            return .zero;
-                        }
-
-                        return JSC.JSPromise.rejectedPromiseValue(globalThis, err.toJSC(globalThis));
-                    },
-                }
-            }
-        } else {
-            var decoded = str.toOwnedSlice(jsc_vm.allocator) catch {
-                return JSC.JSPromise.rejectedPromiseValue(globalThis, ZigString.static("Out of memory").toErrorInstance(globalThis));
-            };
-            defer jsc_vm.allocator.free(decoded);
-            var remain = decoded;
-            const end = remain.ptr + remain.len;
-            while (remain.ptr != end) {
+            while (remain.len > 0) {
                 const result = JSC.Node.Syscall.write(fd, remain);
                 switch (result) {
                     .result => |res| {
@@ -997,6 +953,13 @@ pub const Blob = struct {
             switch (path_) {
                 .path => {
                     const slice = path_.path.slice();
+
+                    if (vm.standalone_module_graph) |graph| {
+                        if (graph.find(slice)) |file| {
+                            return file.blob(globalThis).dupe();
+                        }
+                    }
+
                     var cloned = (allocator.dupeZ(u8, slice) catch unreachable)[0..slice.len];
 
                     break :brk .{
@@ -1231,9 +1194,6 @@ pub const Blob = struct {
                                     .syscall = .open,
                                 }).toSystemError();
 
-                                // assert we never end up reusing the memory
-                                std.debug.assert(@ptrToInt(this.system_error.?.path.slice().ptr) != @ptrToInt(path_buffer));
-
                                 callback(this, null_fd);
                                 return;
                             };
@@ -1396,12 +1356,13 @@ pub const Blob = struct {
                     return;
                 } else if (this.store == null) {
                     bun.default_allocator.destroy(this);
-                    cb(cb_ctx, ResultType{ .err = SystemError{
-                        .code = ZigString.init("INTERNAL_ERROR"),
-                        .path = ZigString.Empty,
-                        .message = ZigString.init("assertion failure - store should not be null"),
-                        .syscall = ZigString.init("read"),
-                    } });
+                    cb(cb_ctx, ResultType{
+                        .err = SystemError{
+                            .code = bun.String.static("INTERNAL_ERROR"),
+                            .message = bun.String.static("assertion failure - store should not be null"),
+                            .syscall = bun.String.static("read"),
+                        },
+                    });
                     return;
                 }
 
@@ -1433,12 +1394,12 @@ pub const Blob = struct {
                         }).toSystemError();
                     } else {
                         this.system_error = JSC.SystemError{
-                            .code = ZigString.init(bun.asByteSlice(@errorName(err))),
+                            .code = bun.String.static(bun.asByteSlice(@errorName(err))),
                             .path = if (this.file_store.pathlike == .path)
-                                ZigString.init(this.file_store.pathlike.path.slice())
+                                bun.String.create(this.file_store.pathlike.path.slice())
                             else
-                                ZigString.Empty,
-                            .syscall = ZigString.init("read"),
+                                bun.String.empty,
+                            .syscall = bun.String.static("read"),
                         };
 
                         this.errno = err;
@@ -1495,13 +1456,13 @@ pub const Blob = struct {
                 if (std.os.S.ISDIR(stat.mode)) {
                     this.errno = error.EISDIR;
                     this.system_error = JSC.SystemError{
-                        .code = ZigString.init("EISDIR"),
+                        .code = bun.String.static("EISDIR"),
                         .path = if (this.file_store.pathlike == .path)
-                            ZigString.init(this.file_store.pathlike.path.slice())
+                            bun.String.create(this.file_store.pathlike.path.slice())
                         else
-                            ZigString.Empty,
-                        .message = ZigString.init("Directories cannot be read like files"),
-                        .syscall = ZigString.init("read"),
+                            bun.String.empty,
+                        .message = bun.String.static("Directories cannot be read like files"),
+                        .syscall = bun.String.static("read"),
                     };
                     return;
                 }
@@ -1680,8 +1641,8 @@ pub const Blob = struct {
                 this.wrote += @truncate(SizeType, result catch |errno| {
                     this.errno = errno;
                     this.system_error = this.system_error orelse JSC.SystemError{
-                        .code = ZigString.init(bun.asByteSlice(@errorName(errno))),
-                        .syscall = ZigString.init("write"),
+                        .code = bun.String.static(bun.asByteSlice(@errorName(errno))),
+                        .syscall = bun.String.static("write"),
                     };
 
                     this.wrote = 0;
@@ -1739,14 +1700,14 @@ pub const Blob = struct {
         };
 
         const unsupported_directory_error = SystemError{
-            .errno = @intCast(c_int, @enumToInt(bun.C.SystemErrno.EISDIR)),
-            .message = ZigString.init("That doesn't work on folders"),
-            .syscall = ZigString.init("fstat"),
+            .errno = @intCast(c_int, @intFromEnum(bun.C.SystemErrno.EISDIR)),
+            .message = bun.String.static("That doesn't work on folders"),
+            .syscall = bun.String.static("fstat"),
         };
         const unsupported_non_regular_file_error = SystemError{
-            .errno = @intCast(c_int, @enumToInt(bun.C.SystemErrno.ENOTSUP)),
-            .message = ZigString.init("Non-regular files aren't supported yet"),
-            .syscall = ZigString.init("fstat"),
+            .errno = @intCast(c_int, @intFromEnum(bun.C.SystemErrno.ENOTSUP)),
+            .message = bun.String.static("Non-regular files aren't supported yet"),
+            .syscall = bun.String.static("fstat"),
         };
 
         // blocking, but off the main thread
@@ -1814,13 +1775,12 @@ pub const Blob = struct {
             pub fn reject(this: *CopyFile, promise: *JSC.JSPromise) void {
                 var globalThis = this.globalThis;
                 var system_error: SystemError = this.system_error orelse SystemError{};
-                if (this.source_file_store.pathlike == .path and system_error.path.len == 0) {
-                    system_error.path = ZigString.init(this.source_file_store.pathlike.path.slice());
-                    system_error.path.mark();
+                if (this.source_file_store.pathlike == .path and system_error.path.isEmpty()) {
+                    system_error.path = bun.String.create(this.source_file_store.pathlike.path.slice());
                 }
 
-                if (system_error.message.len == 0) {
-                    system_error.message = ZigString.init("Failed to copy file");
+                if (system_error.message.isEmpty()) {
+                    system_error.message = bun.String.static("Failed to copy file");
                 }
 
                 var instance = system_error.toErrorInstance(this.globalThis);
@@ -1978,14 +1938,14 @@ pub const Blob = struct {
                             }
 
                             this.system_error = (JSC.Node.Syscall.Error{
-                                .errno = @intCast(JSC.Node.Syscall.Error.Int, @enumToInt(linux.E.INVAL)),
+                                .errno = @intCast(JSC.Node.Syscall.Error.Int, @intFromEnum(linux.E.INVAL)),
                                 .syscall = TryWith.tag.get(use).?,
                             }).toSystemError();
                             return AsyncIO.asError(linux.E.INVAL);
                         },
                         else => |errno| {
                             this.system_error = (JSC.Node.Syscall.Error{
-                                .errno = @intCast(JSC.Node.Syscall.Error.Int, @enumToInt(errno)),
+                                .errno = @intCast(JSC.Node.Syscall.Error.Int, @intFromEnum(errno)),
                                 .syscall = TryWith.tag.get(use).?,
                             }).toSystemError();
                             return AsyncIO.asError(errno);
@@ -2000,7 +1960,7 @@ pub const Blob = struct {
             }
 
             pub fn doFCopyFile(this: *CopyFile) anyerror!void {
-                switch (JSC.Node.Syscall.fcopyfile(this.source_fd, this.destination_fd, os.system.COPYFILE_DATA)) {
+                switch (JSC.Node.Syscall.fcopyfile(this.source_fd, this.destination_fd, os.system.COPYFILE.DATA)) {
                     .err => |errno| {
                         this.system_error = errno.toSystemError();
 
@@ -2240,6 +2200,9 @@ pub const Blob = struct {
         cap: SizeType = 0,
         allocator: std.mem.Allocator,
 
+        /// Used by standalone module graph
+        stored_name: bun.PathString = bun.PathString.empty,
+
         pub fn init(bytes: []u8, allocator: std.mem.Allocator) ByteStore {
             return .{
                 .ptr = bytes.ptr,
@@ -2359,6 +2322,34 @@ pub const Blob = struct {
         return promisified(this.toFormData(globalThis, .temporary), globalThis);
     }
 
+    fn getExistsSync(this: *Blob) JSC.JSValue {
+        if (this.size == Blob.max_size) {
+            this.resolveSize();
+        }
+
+        // If there's no store that means it's empty and we just return true
+        // it will not error to return an empty Blob
+        var store = this.store orelse return JSValue.jsBoolean(true);
+
+        if (store.data == .bytes) {
+            // Bytes will never error
+            return JSValue.jsBoolean(true);
+        }
+
+        // We say regular files and pipes exist.
+        // This is mostly meant for "Can we use this in new Response(file)?"
+        return JSValue.jsBoolean(std.os.S.ISREG(store.data.file.mode) or std.os.S.ISFIFO(store.data.file.mode));
+    }
+
+    // This mostly means 'can it be read?'
+    pub fn getExists(
+        this: *Blob,
+        globalThis: *JSC.JSGlobalObject,
+        _: *JSC.CallFrame,
+    ) callconv(.C) JSValue {
+        return JSC.JSPromise.resolvedPromiseValue(globalThis, this.getExistsSync());
+    }
+
     pub fn getWriter(
         this: *Blob,
         globalThis: *JSC.JSGlobalObject,
@@ -2573,17 +2564,31 @@ pub const Blob = struct {
         this: *Blob,
         globalThis: *JSC.JSGlobalObject,
     ) callconv(.C) JSValue {
+        if (this.getFileName()) |path| {
+            var str = bun.String.create(path);
+            return str.toJS(globalThis);
+        }
+
+        return JSValue.undefined;
+    }
+
+    pub fn getFileName(
+        this: *const Blob,
+    ) ?[]const u8 {
         if (this.store) |store| {
             if (store.data == .file) {
                 if (store.data.file.pathlike == .path) {
-                    return ZigString.fromUTF8(store.data.file.pathlike.path.slice()).toValueGC(globalThis);
+                    return store.data.file.pathlike.path.slice();
                 }
 
                 // we shouldn't return Number here.
+            } else if (store.data == .bytes) {
+                if (store.data.bytes.stored_name.slice().len > 0)
+                    return store.data.bytes.stored_name.slice();
             }
         }
 
-        return JSC.JSValue.jsUndefined();
+        return null;
     }
 
     // TODO: Move this to a separate `File` object or BunFile
@@ -3514,6 +3519,14 @@ pub const AnyBlob = union(enum) {
     InternalBlob: InternalBlob,
     WTFStringImpl: bun.WTF.StringImpl,
 
+    pub fn getFileName(this: *const AnyBlob) ?[]const u8 {
+        return switch (this.*) {
+            .Blob => this.Blob.getFileName(),
+            .WTFStringImpl => null,
+            .InternalBlob => null,
+        };
+    }
+
     pub inline fn fastSize(this: *const AnyBlob) Blob.SizeType {
         return switch (this.*) {
             .Blob => this.Blob.size,
@@ -3816,10 +3829,10 @@ pub const InlineBlob = extern struct {
         var bytes_slice = inline_blob.bytes[0..total];
 
         if (first.len > 0)
-            @memcpy(bytes_slice.ptr, first.ptr, first.len);
+            @memcpy(bytes_slice[0..first.len], first);
 
         if (second.len > 0)
-            @memcpy(bytes_slice.ptr + first.len, second.ptr, second.len);
+            @memcpy(bytes_slice[first.len..][0..second.len], second);
 
         inline_blob.len = @truncate(@TypeOf(inline_blob.len), total);
         return inline_blob;
@@ -3834,7 +3847,7 @@ pub const InlineBlob = extern struct {
         };
 
         if (data.len > 0)
-            @memcpy(&blob.bytes, data.ptr, data.len);
+            @memcpy(blob.bytes[0..data.len], data);
         return blob;
     }
 
diff --git a/src/bun.js/webcore/body.zig b/src/bun.js/webcore/body.zig
index df3ba3ce1..028b104b2 100644
--- a/src/bun.js/webcore/body.zig
+++ b/src/bun.js/webcore/body.zig
@@ -496,8 +496,10 @@ pub const Body = struct {
                     locked.readable.?.value.protect();
                     return locked.readable.?.value;
                 },
-
-                else => unreachable,
+                .Error => {
+                    // TODO: handle error properly
+                    return JSC.WebCore.ReadableStream.empty(globalThis);
+                },
             }
         }
 
diff --git a/src/bun.js/webcore/encoding.zig b/src/bun.js/webcore/encoding.zig
index e4b8a4b95..dd47ccc29 100644
--- a/src/bun.js/webcore/encoding.zig
+++ b/src/bun.js/webcore/encoding.zig
@@ -68,7 +68,7 @@ pub const TextEncoder = struct {
             std.debug.assert(result.read == slice.len);
             const array_buffer = uint8array.asArrayBuffer(globalThis).?;
             std.debug.assert(result.written == array_buffer.len);
-            @memcpy(array_buffer.byteSlice().ptr, &buf, result.written);
+            @memcpy(array_buffer.byteSlice()[0..result.written], buf[0..result.written]);
             return uint8array;
         } else {
             const bytes = strings.allocateLatin1IntoUTF8(globalThis.bunVM().allocator, []const u8, slice) catch {
@@ -103,7 +103,7 @@ pub const TextEncoder = struct {
                 const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, 3);
                 const array_buffer = uint8array.asArrayBuffer(globalThis).?;
                 const replacement_char = [_]u8{ 239, 191, 189 };
-                @memcpy(array_buffer.slice().ptr, &replacement_char, replacement_char.len);
+                @memcpy(array_buffer.slice()[0..replacement_char.len], &replacement_char);
                 return uint8array;
             }
             const uint8array = JSC.JSValue.createUninitializedUint8Array(globalThis, result.written);
@@ -111,7 +111,7 @@ pub const TextEncoder = struct {
             std.debug.assert(result.read == slice.len);
             const array_buffer = uint8array.asArrayBuffer(globalThis).?;
             std.debug.assert(result.written == array_buffer.len);
-            @memcpy(array_buffer.slice().ptr, &buf, result.written);
+            @memcpy(array_buffer.slice()[0..result.written], buf[0..result.written]);
             return uint8array;
         } else {
             var bytes = strings.toUTF8AllocWithType(
@@ -207,7 +207,7 @@ pub const TextEncoder = struct {
         if (array.isEmpty()) {
             array = JSC.JSValue.createUninitializedUint8Array(globalThis, length);
             array.ensureStillAlive();
-            @memcpy(array.asArrayBuffer(globalThis).?.ptr, buf_to_use.ptr, length);
+            @memcpy(array.asArrayBuffer(globalThis).?.ptr[0..length], buf_to_use[0..length]);
         }
 
         return array;
@@ -224,7 +224,7 @@ pub const TextEncoder = struct {
         var result: strings.EncodeIntoResult = strings.copyUTF16IntoUTF8(output, []const u16, input, false);
         if (output.len >= 3 and (result.read == 0 or result.written == 0)) {
             const replacement_char = [_]u8{ 239, 191, 189 };
-            @memcpy(buf_ptr, &replacement_char, replacement_char.len);
+            @memcpy(buf_ptr[0..replacement_char.len], &replacement_char);
             result.read = 1;
             result.written = 3;
         }
@@ -515,10 +515,10 @@ pub const TextDecoder = struct {
         buffer.ensureTotalCapacity(allocator, slice.len) catch unreachable;
         buffer.items.len = i;
 
+        var len = std.mem.sliceAsBytes(slice[0..i]).len;
         @memcpy(
-            std.mem.sliceAsBytes(buffer.items).ptr,
-            std.mem.sliceAsBytes(slice).ptr,
-            std.mem.sliceAsBytes(slice[0..i]).len,
+            std.mem.sliceAsBytes(buffer.items)[0..len],
+            std.mem.sliceAsBytes(slice)[0..len],
         );
 
         const first_high_surrogate = 0xD800;
@@ -537,10 +537,10 @@ pub const TextDecoder = struct {
                     const prev = buffer.items.len;
                     buffer.items.len += count;
                     // Since this string is freshly allocated, we know it's not going to overlap
+                    len = std.mem.sliceAsBytes(remainder[0..count]).len;
                     @memcpy(
-                        std.mem.sliceAsBytes(buffer.items[prev..]).ptr,
-                        std.mem.sliceAsBytes(remainder).ptr,
-                        std.mem.sliceAsBytes(remainder[0..count]).len,
+                        std.mem.sliceAsBytes(buffer.items[prev..])[0..len],
+                        std.mem.sliceAsBytes(remainder)[0..len],
                     );
                     remainder = remainder[count..];
                 },
@@ -659,7 +659,7 @@ pub const TextDecoder = struct {
             },
 
             EncodingLabel.@"UTF-16LE" => {
-                if (std.mem.isAligned(@ptrToInt(buffer_slice.ptr), @alignOf([*]const u16))) {
+                if (std.mem.isAligned(@intFromPtr(buffer_slice.ptr), @alignOf([*]const u16))) {
                     return this.decodeUTF16WithAlignment([]const u16, @alignCast(2, std.mem.bytesAsSlice(u16, buffer_slice)), globalThis);
                 }
 
@@ -701,7 +701,7 @@ pub const TextDecoder = struct {
 
 pub const Encoder = struct {
     export fn Bun__encoding__writeLatin1(input: [*]const u8, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
-        return switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        return switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .utf8 => writeU8(input, len, to, to_len, .utf8),
             .latin1 => writeU8(input, len, to, to_len, .ascii),
             .ascii => writeU8(input, len, to, to_len, .ascii),
@@ -714,7 +714,7 @@ pub const Encoder = struct {
         } catch 0;
     }
     export fn Bun__encoding__writeUTF16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, encoding: u8) usize {
-        return switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        return switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .utf8 => writeU16(input, len, to, to_len, .utf8, false),
             .latin1 => writeU16(input, len, to, to_len, .ascii, false),
             .ascii => writeU16(input, len, to, to_len, .ascii, false),
@@ -727,7 +727,7 @@ pub const Encoder = struct {
         } catch 0;
     }
     export fn Bun__encoding__byteLengthLatin1(input: [*]const u8, len: usize, encoding: u8) usize {
-        return switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        return switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .utf8 => byteLengthU8(input, len, .utf8),
             .latin1 => byteLengthU8(input, len, .ascii),
             .ascii => byteLengthU8(input, len, .ascii),
@@ -740,7 +740,7 @@ pub const Encoder = struct {
         };
     }
     export fn Bun__encoding__byteLengthUTF16(input: [*]const u16, len: usize, encoding: u8) usize {
-        return switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        return switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .utf8 => byteLengthU16(input, len, .utf8),
             .latin1 => byteLengthU16(input, len, .ascii),
             .ascii => byteLengthU16(input, len, .ascii),
@@ -753,7 +753,7 @@ pub const Encoder = struct {
         };
     }
     export fn Bun__encoding__constructFromLatin1(globalObject: *JSGlobalObject, input: [*]const u8, len: usize, encoding: u8) JSValue {
-        var slice = switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        var slice = switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .hex => constructFromU8(input, len, .hex),
             .ascii => constructFromU8(input, len, .ascii),
             .base64url => constructFromU8(input, len, .base64url),
@@ -766,7 +766,7 @@ pub const Encoder = struct {
         return JSC.JSValue.createBuffer(globalObject, slice, globalObject.bunVM().allocator);
     }
     export fn Bun__encoding__constructFromUTF16(globalObject: *JSGlobalObject, input: [*]const u16, len: usize, encoding: u8) JSValue {
-        var slice = switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        var slice = switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .base64 => constructFromU16(input, len, .base64),
             .hex => constructFromU16(input, len, .hex),
             .base64url => constructFromU16(input, len, .base64url),
@@ -785,7 +785,7 @@ pub const Encoder = struct {
     }
 
     export fn Bun__encoding__toString(input: [*]const u8, len: usize, globalObject: *JSC.JSGlobalObject, encoding: u8) JSValue {
-        return switch (@intToEnum(JSC.Node.Encoding, encoding)) {
+        return switch (@enumFromInt(JSC.Node.Encoding, encoding)) {
             .ucs2 => toString(input, len, globalObject, .utf16le),
             .utf16le => toString(input, len, globalObject, .utf16le),
             .utf8 => toString(input, len, globalObject, .utf8),
@@ -802,7 +802,20 @@ pub const Encoder = struct {
     // pub fn writeUTF16AsUTF8(utf16: [*]const u16, len: usize, to: [*]u8, to_len: usize) callconv(.C) i32 {
     //     return @intCast(i32, strings.copyUTF16IntoUTF8(to[0..to_len], []const u16, utf16[0..len], true).written);
     // }
-
+    pub fn toStringAtRuntime(input: [*]const u8, len: usize, globalObject: *JSGlobalObject, encoding: JSC.Node.Encoding) JSValue {
+        return switch (encoding) {
+            .ucs2 => toString(input, len, globalObject, .utf16le),
+            .utf16le => toString(input, len, globalObject, .utf16le),
+            .utf8 => toString(input, len, globalObject, .utf8),
+            .ascii => toString(input, len, globalObject, .ascii),
+            .hex => toString(input, len, globalObject, .hex),
+            .base64 => toString(input, len, globalObject, .base64),
+            .base64url => toString(input, len, globalObject, .base64url),
+            .latin1 => toString(input, len, globalObject, .latin1),
+            // treat everything else as utf8
+            else => toString(input, len, globalObject, .utf8),
+        };
+    }
     pub fn toString(input_ptr: [*]const u8, len: usize, global: *JSGlobalObject, comptime encoding: JSC.Node.Encoding) JSValue {
         if (len == 0)
             return ZigString.Empty.toValue(global);
@@ -816,23 +829,18 @@ pub const Encoder = struct {
                     return ZigString.init(input).toValueGC(global);
                 }
 
-                if (input.len < 512) {
-                    var buf: [512]u8 = undefined;
-                    var to = buf[0..input.len];
-                    strings.copyLatin1IntoASCII(to, input);
-                    return ZigString.init(to).toValueGC(global);
-                }
-
-                var to = allocator.alloc(u8, len) catch return ZigString.init("Out of memory").toErrorInstance(global);
-                strings.copyLatin1IntoASCII(to, input);
-                return ZigString.init(to).toExternalValue(global);
+                var str = bun.String.createUninitialized(.latin1, len) orelse return ZigString.init("Out of memory").toErrorInstance(global);
+                defer str.deref();
+                strings.copyLatin1IntoASCII(@constCast(str.latin1()), input);
+                return str.toJS(global);
             },
             .latin1 => {
-                var to = allocator.alloc(u8, len) catch return ZigString.init("Out of memory").toErrorInstance(global);
+                var str = bun.String.createUninitialized(.latin1, len) orelse return ZigString.init("Out of memory").toErrorInstance(global);
+                defer str.deref();
 
-                @memcpy(to.ptr, input_ptr, to.len);
+                @memcpy(@constCast(str.latin1()), input_ptr[0..len]);
 
-                return ZigString.init(to).toExternalValue(global);
+                return str.toJS(global);
             },
             .buffer, .utf8 => {
                 const converted = strings.toUTF16Alloc(allocator, input, false) catch return ZigString.init("Out of memory").toErrorInstance(global);
@@ -848,21 +856,22 @@ pub const Encoder = struct {
                 // Avoid incomplete characters
                 if (len / 2 == 0) return ZigString.Empty.toValue(global);
 
-                var output = allocator.alloc(u16, len / 2) catch return ZigString.init("Out of memory").toErrorInstance(global);
-                var output_bytes = std.mem.sliceAsBytes(output);
+                var output = bun.String.createUninitialized(.utf16, len / 2) orelse return ZigString.init("Out of memory").toErrorInstance(global);
+                defer output.deref();
+                var output_bytes = std.mem.sliceAsBytes(@constCast(output.utf16()));
                 output_bytes[output_bytes.len - 1] = 0;
 
-                @memcpy(output_bytes.ptr, input_ptr, output_bytes.len);
-                return ZigString.toExternalU16(output.ptr, output.len, global);
+                @memcpy(output_bytes, input_ptr[0..output_bytes.len]);
+                return output.toJS(global);
             },
 
             .hex => {
-                var output = allocator.alloc(u8, input.len * 2) catch return ZigString.init("Out of memory").toErrorInstance(global);
+                var str = bun.String.createUninitialized(.latin1, len * 2) orelse return ZigString.init("Out of memory").toErrorInstance(global);
+                defer str.deref();
+                var output = @constCast(str.latin1());
                 const wrote = strings.encodeBytesToHex(output, input);
                 std.debug.assert(wrote == output.len);
-                var val = ZigString.init(output);
-                val.mark();
-                return val.toExternalValue(global);
+                return str.toJS(global);
             },
 
             .base64url => {
@@ -892,7 +901,7 @@ pub const Encoder = struct {
         switch (comptime encoding) {
             .buffer => {
                 const written = @min(len, to_len);
-                @memcpy(to_ptr, input, written);
+                @memcpy(to_ptr[0..written], input[0..written]);
 
                 return written;
             },
@@ -903,7 +912,7 @@ pub const Encoder = struct {
                 var remain = input[0..written];
 
                 if (bun.simdutf.validate.ascii(remain)) {
-                    @memcpy(to.ptr, remain.ptr, written);
+                    @memcpy(to_ptr[0..written], remain[0..written]);
                 } else {
                     strings.copyLatin1IntoASCII(to, remain);
                 }
@@ -919,7 +928,7 @@ pub const Encoder = struct {
                 if (to_len < 2)
                     return 0;
 
-                if (std.mem.isAligned(@ptrToInt(to_ptr), @alignOf([*]u16))) {
+                if (std.mem.isAligned(@intFromPtr(to_ptr), @alignOf([*]u16))) {
                     var buf = input[0..len];
 
                     var output = @ptrCast([*]u16, @alignCast(@alignOf(u16), to_ptr))[0 .. to_len / 2];
@@ -972,6 +981,14 @@ pub const Encoder = struct {
         }
     }
 
+    pub fn encodeIntoFrom16(input: []const u16, to: []u8, comptime encoding: JSC.Node.Encoding, comptime allow_partial_write: bool) !usize {
+        return writeU16(input.ptr, input.len, to.ptr, to.len, encoding, allow_partial_write);
+    }
+
+    pub fn encodeIntoFrom8(input: []const u8, to: []u8, comptime encoding: JSC.Node.Encoding) !usize {
+        return writeU8(input.ptr, input.len, to.ptr, to.len, encoding);
+    }
+
     pub fn writeU16(input: [*]const u16, len: usize, to: [*]u8, to_len: usize, comptime encoding: JSC.Node.Encoding, comptime allow_partial_write: bool) !usize {
         if (len == 0)
             return 0;
@@ -1065,14 +1082,14 @@ pub const Encoder = struct {
         switch (comptime encoding) {
             .buffer => {
                 var to = allocator.alloc(u8, len) catch return &[_]u8{};
-                @memcpy(to.ptr, input, len);
+                @memcpy(to[0..len], input[0..len]);
 
                 return to;
             },
             .latin1, .ascii => {
                 var to = allocator.alloc(u8, len) catch return &[_]u8{};
 
-                @memcpy(to.ptr, input, len);
+                @memcpy(to[0..len], input[0..len]);
 
                 return to;
             },
@@ -1121,7 +1138,7 @@ pub const Encoder = struct {
             .latin1, .buffer, .ascii => {
                 var to = allocator.alloc(u8, len) catch return &[_]u8{};
                 var input_bytes = std.mem.sliceAsBytes(input[0..len]);
-                @memcpy(to.ptr, input_bytes.ptr, input_bytes.len);
+                @memcpy(to[0..input_bytes.len], input_bytes);
                 for (to[0..len], 0..) |c, i| {
                     to[i] = @as(u8, @truncate(u7, c));
                 }
@@ -1131,7 +1148,8 @@ pub const Encoder = struct {
             // string is already encoded, just need to copy the data
             .ucs2, .utf16le => {
                 var to = std.mem.sliceAsBytes(allocator.alloc(u16, len * 2) catch return &[_]u8{});
-                @memcpy(to.ptr, std.mem.sliceAsBytes(input[0..len]).ptr, std.mem.sliceAsBytes(input[0..len]).len);
+                const bytes = std.mem.sliceAsBytes(input[0..len]);
+                @memcpy(to[0..bytes.len], bytes);
                 return to;
             },
 
diff --git a/src/bun.js/webcore/request.zig b/src/bun.js/webcore/request.zig
index a8c648212..cff2ef2f3 100644
--- a/src/bun.js/webcore/request.zig
+++ b/src/bun.js/webcore/request.zig
@@ -255,19 +255,7 @@ pub const Request = struct {
         this: *Request,
         globalThis: *JSC.JSGlobalObject,
     ) callconv(.C) JSC.JSValue {
-        const string_contents: string = switch (this.method) {
-            .GET => "GET",
-            .HEAD => "HEAD",
-            .PATCH => "PATCH",
-            .PUT => "PUT",
-            .POST => "POST",
-            .OPTIONS => "OPTIONS",
-            .CONNECT => "CONNECT",
-            .TRACE => "TRACE",
-            .DELETE => "DELETE",
-        };
-
-        return ZigString.init(string_contents).toValueGC(globalThis);
+        return bun.String.static(@tagName(this.method)).toJSConst(globalThis);
     }
 
     pub fn getMode(
@@ -472,8 +460,8 @@ pub const Request = struct {
                 url_or_object,
             if (is_first_argument_a_url) JSValue.undefined else url_or_object,
         };
-        const values_to_try = values_to_try_[0 .. @as(usize, @boolToInt(!is_first_argument_a_url)) +
-            @as(usize, @boolToInt(arguments.len > 1 and arguments[1].isObject()))];
+        const values_to_try = values_to_try_[0 .. @as(usize, @intFromBool(!is_first_argument_a_url)) +
+            @as(usize, @intFromBool(arguments.len > 1 and arguments[1].isObject()))];
 
         for (values_to_try) |value| {
             const value_type = value.jsType();
@@ -564,7 +552,7 @@ pub const Request = struct {
                         fields.insert(.url);
 
                     // first value
-                } else if (@enumToInt(value) == @enumToInt(values_to_try[values_to_try.len - 1]) and !is_first_argument_a_url and
+                } else if (@intFromEnum(value) == @intFromEnum(values_to_try[values_to_try.len - 1]) and !is_first_argument_a_url and
                     value.implementsToString(globalThis))
                 {
                     const slice = value.toSliceOrNull(globalThis) orelse {
diff --git a/src/bun.js/webcore/response.classes.ts b/src/bun.js/webcore/response.classes.ts
index b6ad452d2..c11cb10b2 100644
--- a/src/bun.js/webcore/response.classes.ts
+++ b/src/bun.js/webcore/response.classes.ts
@@ -132,6 +132,7 @@ export default [
       slice: { fn: "getSlice", length: 2 },
       stream: { fn: "getStream", length: 1 },
       formData: { fn: "getFormData" },
+      exists: { fn: "getExists", length: 0 },
 
       type: {
         getter: "getType",
diff --git a/src/bun.js/webcore/response.zig b/src/bun.js/webcore/response.zig
index beae4d182..e888ffa5a 100644
--- a/src/bun.js/webcore/response.zig
+++ b/src/bun.js/webcore/response.zig
@@ -386,7 +386,7 @@ pub const Response = struct {
 
         const json_value = args.nextEat() orelse JSC.JSValue.zero;
 
-        if (@enumToInt(json_value) != 0) {
+        if (@intFromEnum(json_value) != 0) {
             var zig_str = JSC.ZigString.init("");
             // calling JSON.stringify on an empty string adds extra quotes
             // so this is correct
@@ -448,7 +448,7 @@ pub const Response = struct {
         const url_string_value = args.nextEat() orelse JSC.JSValue.zero;
         var url_string = ZigString.init("");
 
-        if (@enumToInt(url_string_value) != 0) {
+        if (@intFromEnum(url_string_value) != 0) {
             url_string = url_string_value.getZigString(globalThis.ptr());
         }
         var url_string_slice = url_string.toSlice(getAllocator(globalThis));
@@ -777,15 +777,15 @@ pub const Fetch = struct {
             }
 
             const fetch_error = JSC.SystemError{
-                .code = ZigString.init(@errorName(this.result.fail)),
+                .code = bun.String.static(@errorName(this.result.fail)),
                 .message = switch (this.result.fail) {
-                    error.ConnectionClosed => ZigString.init("The socket connection was closed unexpectedly. For more information, pass `verbose: true` in the second argument to fetch()"),
-                    error.FailedToOpenSocket => ZigString.init("Was there a typo in the url or port?"),
-                    error.TooManyRedirects => ZigString.init("The response redirected too many times. For more information, pass `verbose: true` in the second argument to fetch()"),
-                    error.ConnectionRefused => ZigString.init("Unable to connect. Is the computer able to access the url?"),
-                    else => ZigString.init("fetch() failed. For more information, pass `verbose: true` in the second argument to fetch()"),
+                    error.ConnectionClosed => bun.String.static("The socket connection was closed unexpectedly. For more information, pass `verbose: true` in the second argument to fetch()"),
+                    error.FailedToOpenSocket => bun.String.static("Was there a typo in the url or port?"),
+                    error.TooManyRedirects => bun.String.static("The response redirected too many times. For more information, pass `verbose: true` in the second argument to fetch()"),
+                    error.ConnectionRefused => bun.String.static("Unable to connect. Is the computer able to access the url?"),
+                    else => bun.String.static("fetch() failed. For more information, pass `verbose: true` in the second argument to fetch()"),
                 },
-                .path = ZigString.init(this.http.?.url.href),
+                .path = bun.String.create(this.http.?.url.href),
             };
 
             return fetch_error.toErrorInstance(this.global_this);
@@ -1147,9 +1147,9 @@ pub const Fetch = struct {
                                 JSC.JSError(bun.default_allocator, "Out of memory", .{}, ctx, exception);
                                 return .zero;
                             };
-                            @memcpy(buffer.ptr, url_slice.ptr, url_slice.len);
+                            @memcpy(buffer[0..url_slice.len], url_slice);
                             var proxy_url_slice = buffer[url_slice.len..];
-                            @memcpy(proxy_url_slice.ptr, proxy_url_zig.ptr, proxy_url_zig.len);
+                            @memcpy(proxy_url_slice[0..proxy_url_zig.len], proxy_url_zig.ptr[0..proxy_url_zig.len]);
 
                             url = ZigURL.parse(buffer[0..url_slice.len]);
                             proxy = ZigURL.parse(proxy_url_slice);
@@ -1283,9 +1283,9 @@ pub const Fetch = struct {
                                 JSC.JSError(bun.default_allocator, "Out of memory", .{}, ctx, exception);
                                 return .zero;
                             };
-                            @memcpy(buffer.ptr, url_slice.ptr, url_slice.len);
+                            @memcpy(buffer[0..url_slice.len], url_slice.ptr[0..url_slice.len]);
                             var proxy_url_slice = buffer[url_slice.len..];
-                            @memcpy(proxy_url_slice.ptr, proxy_url_zig.ptr, proxy_url_zig.len);
+                            @memcpy(proxy_url_slice[0..proxy_url_zig.len], proxy_url_zig.ptr[0..proxy_url_zig.len]);
 
                             url = ZigURL.parse(buffer[0..url_slice.len]);
                             proxy = ZigURL.parse(proxy_url_slice);
@@ -1695,7 +1695,7 @@ pub const FetchEvent = struct {
 
         defer {
             if (!VirtualMachine.get().had_errors) {
-                Output.printElapsed(@intToFloat(f64, (request_context.timer.lap())) / std.time.ns_per_ms);
+                Output.printElapsed(@floatFromInt(f64, (request_context.timer.lap())) / std.time.ns_per_ms);
 
                 Output.prettyError(
                     " <b>{s}<r><d> - <b>{d}<r> <d>transpiled, <d><b>{d}<r> <d>imports<r>\n",
diff --git a/src/bun.js/webcore/streams.zig b/src/bun.js/webcore/streams.zig
index dddfcbaf5..343ce37ab 100644
--- a/src/bun.js/webcore/streams.zig
+++ b/src/bun.js/webcore/streams.zig
@@ -247,7 +247,7 @@ pub const ReadableStream = struct {
 
     pub fn fromNative(globalThis: *JSGlobalObject, id: Tag, ptr: *anyopaque) JSC.JSValue {
         JSC.markBinding(@src());
-        return ZigGlobalObject__createNativeReadableStream(globalThis, JSValue.fromPtr(ptr), JSValue.jsNumber(@enumToInt(id)));
+        return ZigGlobalObject__createNativeReadableStream(globalThis, JSValue.fromPtr(ptr), JSValue.jsNumber(@intFromEnum(id)));
     }
 
     pub fn fromBlob(globalThis: *JSGlobalObject, blob: *const Blob, recommended_chunk_size: Blob.SizeType) JSC.JSValue {
@@ -329,11 +329,11 @@ pub const ReadableStream = struct {
             const filedes_ = @bitCast([8]u8, @as(usize, @truncate(u56, @intCast(usize, filedes))));
             bytes[1..8].* = filedes_[0..7].*;
 
-            return @intToEnum(StreamTag, @bitCast(u64, bytes));
+            return @enumFromInt(StreamTag, @bitCast(u64, bytes));
         }
 
         pub fn fd(this: StreamTag) bun.FileDescriptor {
-            var bytes = @bitCast([8]u8, @enumToInt(this));
+            var bytes = @bitCast([8]u8, @intFromEnum(this));
             if (bytes[0] != 1) {
                 return bun.invalid_fd;
             }
@@ -780,13 +780,15 @@ pub const StreamResult = union(Tag) {
             .temporary => |temp| {
                 var array = JSC.JSValue.createUninitializedUint8Array(globalThis, temp.len);
                 var slice_ = array.asArrayBuffer(globalThis).?.slice();
-                @memcpy(slice_.ptr, temp.ptr, temp.len);
+                const temp_slice = temp.slice();
+                @memcpy(slice_[0..temp_slice.len], temp_slice);
                 return array;
             },
             .temporary_and_done => |temp| {
                 var array = JSC.JSValue.createUninitializedUint8Array(globalThis, temp.len);
                 var slice_ = array.asArrayBuffer(globalThis).?.slice();
-                @memcpy(slice_.ptr, temp.ptr, temp.len);
+                const temp_slice = temp.slice();
+                @memcpy(slice_[0..temp_slice.len], temp_slice);
                 return array;
             },
             .into_array => |array| {
@@ -818,7 +820,7 @@ pub const Signal = struct {
     ptr: *anyopaque = dead,
     vtable: VTable = VTable.Dead,
 
-    pub const dead = @intToPtr(*anyopaque, 0xaaaaaaaa);
+    pub const dead = @ptrFromInt(*anyopaque, 0xaaaaaaaa);
 
     pub fn clear(this: *Signal) void {
         this.ptr = dead;
@@ -920,7 +922,7 @@ pub const Sink = struct {
     used: bool = false,
 
     pub const pending = Sink{
-        .ptr = @intToPtr(*anyopaque, 0xaaaaaaaa),
+        .ptr = @ptrFromInt(*anyopaque, 0xaaaaaaaa),
         .vtable = undefined,
     };
 
@@ -961,7 +963,8 @@ pub const Sink = struct {
 
             if (stack_size >= str.len) {
                 var buf: [stack_size]u8 = undefined;
-                @memcpy(&buf, str.ptr, str.len);
+                @memcpy(buf[0..str.len], str);
+
                 strings.replaceLatin1WithUTF8(buf[0..str.len]);
                 if (input.isDone()) {
                     const result = writeFn(ctx, .{ .temporary_and_done = bun.ByteList.init(buf[0..str.len]) });
@@ -974,7 +977,8 @@ pub const Sink = struct {
 
             {
                 var slice = bun.default_allocator.alloc(u8, str.len) catch return .{ .err = Syscall.Error.oom };
-                @memcpy(slice.ptr, str.ptr, str.len);
+                @memcpy(slice[0..str.len], str);
+
                 strings.replaceLatin1WithUTF8(slice[0..str.len]);
                 if (input.isDone()) {
                     return writeFn(ctx, .{ .owned_and_done = bun.ByteList.init(slice) });
@@ -1262,7 +1266,7 @@ pub const FileSink = struct {
 
         const initial_remain = remain;
         defer {
-            std.debug.assert(total - initial == @ptrToInt(remain.ptr) - @ptrToInt(initial_remain.ptr));
+            std.debug.assert(total - initial == @intFromPtr(remain.ptr) - @intFromPtr(initial_remain.ptr));
 
             if (remain.len == 0) {
                 this.head = 0;
@@ -1908,15 +1912,15 @@ pub fn NewJSSink(comptime SinkType: type, comptime name_: []const u8) type {
             pub fn init(cpp: JSValue) Signal {
                 // this one can be null
                 @setRuntimeSafety(false);
-                return Signal.initWithType(SinkSignal, @intToPtr(*SinkSignal, @bitCast(usize, @enumToInt(cpp))));
+                return Signal.initWithType(SinkSignal, @ptrFromInt(*SinkSignal, @bitCast(usize, @intFromEnum(cpp))));
             }
 
             pub fn close(this: *@This(), _: ?Syscall.Error) void {
-                onClose(@bitCast(SinkSignal, @ptrToInt(this)).cpp, JSValue.jsUndefined());
+                onClose(@bitCast(SinkSignal, @intFromPtr(this)).cpp, JSValue.jsUndefined());
             }
 
             pub fn ready(this: *@This(), _: ?Blob.SizeType, _: ?Blob.SizeType) void {
-                onReady(@bitCast(SinkSignal, @ptrToInt(this)).cpp, JSValue.jsUndefined(), JSValue.jsUndefined());
+                onReady(@bitCast(SinkSignal, @intFromPtr(this)).cpp, JSValue.jsUndefined(), JSValue.jsUndefined());
             }
 
             pub fn start(_: *@This()) void {}
@@ -1960,10 +1964,10 @@ pub fn NewJSSink(comptime SinkType: type, comptime name_: []const u8) type {
                     pub const message = std.fmt.comptimePrint("{s} is not constructable", .{SinkType.name});
                 };
                 const err = JSC.SystemError{
-                    .message = ZigString.init(Static.message),
-                    .code = ZigString.init(@as(string, @tagName(JSC.Node.ErrorCode.ERR_ILLEGAL_CONSTRUCTOR))),
+                    .message = bun.String.static(Static.message),
+                    .code = bun.String.static(@as(string, @tagName(JSC.Node.ErrorCode.ERR_ILLEGAL_CONSTRUCTOR))),
                 };
-                globalThis.vm().throwError(globalThis, err.toErrorInstance(globalThis));
+                globalThis.throwValue(err.toErrorInstance(globalThis));
                 return JSC.JSValue.jsUndefined();
             }
 
@@ -1992,7 +1996,7 @@ pub fn NewJSSink(comptime SinkType: type, comptime name_: []const u8) type {
             if (this.sink.signal.isDead())
                 return;
             this.sink.signal.clear();
-            const value = @intToEnum(JSValue, @bitCast(JSC.JSValueReprInt, @ptrToInt(ptr)));
+            const value = @enumFromInt(JSValue, @bitCast(JSC.JSValueReprInt, @intFromPtr(ptr)));
             value.unprotect();
             detachPtr(value);
         }
@@ -3127,7 +3131,7 @@ pub const ByteBlobLoader = struct {
         this.remain -|= copied;
         this.offset +|= copied;
         std.debug.assert(buffer.ptr != temporary.ptr);
-        @memcpy(buffer.ptr, temporary.ptr, temporary.len);
+        @memcpy(buffer[0..temporary.len], temporary);
         if (this.remain == 0) {
             return .{ .into_array_and_done = .{ .value = array, .len = copied } };
         }
@@ -3231,7 +3235,7 @@ pub const ByteStream = struct {
         }
 
         if (this.has_received_last_chunk) {
-            return .{ .chunk_size = @truncate(Blob.SizeType, @min(1024 * 1024 * 2, this.buffer.items.len)) };
+            return .{ .chunk_size = @min(1024 * 1024 * 2, this.buffer.items.len) };
         }
 
         if (this.highWaterMark == 0) {
@@ -3292,7 +3296,7 @@ pub const ByteStream = struct {
             var to_copy = this.pending_buffer[0..@min(chunk.len, this.pending_buffer.len)];
             const pending_buffer_len = this.pending_buffer.len;
             std.debug.assert(to_copy.ptr != chunk.ptr);
-            @memcpy(to_copy.ptr, chunk.ptr, to_copy.len);
+            @memcpy(to_copy, chunk[0..to_copy.len]);
             this.pending_buffer = &.{};
 
             const is_really_done = this.has_received_last_chunk and to_copy.len <= pending_buffer_len;
@@ -3382,7 +3386,7 @@ pub const ByteStream = struct {
             );
             var remaining_in_buffer = this.buffer.items[this.offset..][0..to_write];
 
-            @memcpy(buffer.ptr, this.buffer.items.ptr + this.offset, to_write);
+            @memcpy(buffer[0..to_write], this.buffer.items[this.offset..][0..to_write]);
 
             if (this.offset + to_write == this.buffer.items.len) {
                 this.offset = 0;
@@ -4071,7 +4075,7 @@ pub const File = struct {
                     this.pending.result = .{
                         .err = Syscall.Error{
                             // this is too hacky
-                            .errno = @truncate(Syscall.Error.Int, @intCast(u16, @max(1, @errorToInt(err)))),
+                            .errno = @truncate(Syscall.Error.Int, @intCast(u16, @max(1, @intFromError(err)))),
                             .syscall = .read,
                         },
                     };
@@ -4655,4 +4659,3 @@ pub fn NewReadyWatcher(
 //         pub fn onError(this: *Streamer): anytype,
 //     };
 // }
-