60 files changed, 30288 insertions, 22741 deletions
diff --git a/src/bun.js/bindings/BunString.cpp b/src/bun.js/bindings/BunString.cpp
index f737342f4..21541d711 100644
--- a/src/bun.js/bindings/BunString.cpp
+++ b/src/bun.js/bindings/BunString.cpp
@@ -7,6 +7,11 @@
 #include "GCDefferalContext.h"
 using namespace JSC;
 
+extern "C" bool Bun__WTFStringImpl__hasPrefix(const WTF::StringImpl* impl, const char* bytes, size_t length)
+{
+    return impl->startsWith(bytes, length); // C-ABI wrapper over StringImpl::startsWith; `impl` is dereferenced without a null check
+}
+
 extern "C" void Bun__WTFStringImpl__deref(WTF::StringImpl* impl)
 {
     impl->deref();
@@ -81,31 +86,69 @@ BunString toString(JSC::JSGlobalObject* globalObject, JSValue value)
     return fromJS(globalObject, value);
 }
 
+BunString toStringRef(JSC::JSGlobalObject* globalObject, JSValue value)
+{
+    auto str = value.toWTFString(globalObject);
+    if (str.isEmpty()) {
+        return { BunStringTag::Empty };
+    }
+    // Take an extra reference on the impl before returning its raw pointer, so the
+    // returned BunString does not depend on the local `str` staying alive.
+    str.impl()->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = str.impl() } };
+}
+
 BunString toString(WTF::String& wtfString)
 {
-    if (wtfString.length() == 0)
+    if (wtfString.isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
 }
 BunString toString(const WTF::String& wtfString)
 {
-    if (wtfString.length() == 0)
+    if (wtfString.isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
 }
 BunString toString(WTF::StringImpl* wtfString)
 {
-    if (wtfString->length() == 0)
+    if (wtfString->isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString } };
 }
 
+BunString toStringRef(WTF::String& wtfString)
+{
+    if (wtfString.isEmpty())
+        return { BunStringTag::Empty };
+    // Unlike toString(WTF::String&), take a reference on the impl before exposing its raw pointer.
+    wtfString.impl()->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
+}
+BunString toStringRef(const WTF::String& wtfString)
+{
+    if (wtfString.isEmpty())
+        return { BunStringTag::Empty };
+    // Unlike toString(const WTF::String&), take a reference on the impl before exposing its raw pointer.
+    wtfString.impl()->ref();
+    return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
+}
+BunString toStringRef(WTF::StringImpl* wtfString)
+{
+    if (wtfString->isEmpty())
+        return { BunStringTag::Empty };
+    // `wtfString` is dereferenced unchecked above, so callers must pass a non-null impl.
+    wtfString->ref();
+    // The reference taken above travels with the returned BunString.
+    return { BunStringTag::WTFStringImpl, { .wtf = wtfString } };
+}
+
 BunString fromString(WTF::String& wtfString)
 {
-    if (wtfString.length() == 0)
+    if (wtfString.isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString.impl() } };
@@ -113,7 +156,7 @@ BunString fromString(WTF::String& wtfString)
 
 BunString fromString(WTF::StringImpl* wtfString)
 {
-    if (wtfString->length() == 0)
+    if (wtfString->isEmpty())
         return { BunStringTag::Empty };
 
     return { BunStringTag::WTFStringImpl, { .wtf = wtfString } };
@@ -126,6 +169,29 @@ extern "C" JSC::EncodedJSValue BunString__toJS(JSC::JSGlobalObject* globalObject
     return JSValue::encode(Bun::toJS(globalObject, *bunString));
 }
 
+extern "C" BunString BunString__fromUTF16Unitialized(size_t length)
+{
+    // Reject unrepresentable lengths first: narrowing size_t to unsigned would allocate a shorter buffer than requested.
+    if (UNLIKELY(length > WTF::String::MaxLength)) return { BunStringTag::Dead };
+    UChar* ptr;
+    auto impl = WTF::StringImpl::createUninitialized(static_cast<unsigned>(length), ptr);
+    if (UNLIKELY(!ptr))
+        return { BunStringTag::Dead };
+    impl->ref(); // NOTE(review): ref() plus leakRef() hands two references to the caller — confirm the consumer expects that
+    return { BunStringTag::WTFStringImpl, { .wtf = &impl.leakRef() } };
+}
+
+extern "C" BunString BunString__fromLatin1Unitialized(size_t length)
+{
+    if (UNLIKELY(length > WTF::String::MaxLength)) return { BunStringTag::Dead }; // size_t would otherwise be narrowed to unsigned, under-allocating
+    LChar* ptr;
+    auto impl = WTF::StringImpl::createUninitialized(static_cast<unsigned>(length), ptr);
+    if (UNLIKELY(!ptr))
+        return { BunStringTag::Dead };
+    impl->ref(); // NOTE(review): ref() plus leakRef() hands two references to the caller — confirm the consumer expects that
+    return { BunStringTag::WTFStringImpl, { .wtf = &impl.leakRef() } };
+}
+
 extern "C" BunString BunString__fromUTF8(const char* bytes, size_t length)
 {
     if (simdutf::validate_utf8(bytes, length)) {
diff --git a/src/bun.js/bindings/CommonJSModuleRecord.cpp b/src/bun.js/bindings/CommonJSModuleRecord.cpp
index 1cee1091b..8adba197c 100644
--- a/src/bun.js/bindings/CommonJSModuleRecord.cpp
+++ b/src/bun.js/bindings/CommonJSModuleRecord.cpp
@@ -59,458 +59,914 @@
 
 #include <JavaScriptCore/DFGAbstractHeap.h>
 #include <JavaScriptCore/Completion.h>
+#include "ModuleLoader.h"
 #include <JavaScriptCore/JSMap.h>
 
 #include <JavaScriptCore/JSMapInlines.h>
 #include <JavaScriptCore/GetterSetter.h>
 #include "ZigSourceProvider.h"
+#include "JavaScriptCore/FunctionPrototype.h"
+#include "CommonJSModuleRecord.h"
+#include <JavaScriptCore/JSModuleNamespaceObject.h>
+#include <JavaScriptCore/JSSourceCode.h>
+#include <JavaScriptCore/LazyPropertyInlines.h>
 
 namespace Bun {
 using namespace JSC;
 
-class JSCommonJSModule final : public JSC::JSNonFinalObject {
-public:
-    using Base = JSC::JSNonFinalObject;
-    static constexpr unsigned StructureFlags = Base::StructureFlags | JSC::OverridesPut;
+JSC_DECLARE_HOST_FUNCTION(jsFunctionRequireCommonJS);
 
-    mutable JSC::WriteBarrier<JSC::Unknown> m_exportsObject;
-    mutable JSC::WriteBarrier<JSC::JSString> m_id;
+// Decides whether a Structure qualifies for the fast enumeration path:
+// it qualifies only when every one of the checks below comes back negative.
+static bool canPerformFastEnumeration(Structure* s)
+{
+    const bool overridesLookup = s->typeInfo().overridesGetOwnPropertySlot()
+        || s->typeInfo().overridesAnyFormOfGetOwnPropertyNames();
+    if (overridesLookup)
+        return false;
+
+    const bool hasDisqualifyingShape = hasIndexedProperties(s->indexingType())
+        || s->hasAnyKindOfGetterSetterProperties()
+        || s->isUncacheableDictionary()
+        || s->hasUnderscoreProtoPropertyExcludingOriginalProto();
+
+    return !hasDisqualifyingShape;
+}
 
-    void finishCreation(JSC::VM& vm, JSC::JSValue exportsObject, JSC::JSString* id, JSC::JSString* filename, JSC::JSString* dirname, JSC::JSValue requireFunction)
-    {
-        Base::finishCreation(vm);
-        ASSERT(inherits(vm, info()));
-        m_exportsObject.set(vm, this, exportsObject);
-        m_id.set(vm, this, id);
+static bool evaluateCommonJSModuleOnce(JSC::VM& vm, Zig::GlobalObject* globalObject, JSCommonJSModule* moduleObject, JSString* dirname, JSString* filename, WTF::NakedPtr<Exception>& exception)
+{
+    JSC::Structure* thisObjectStructure = globalObject->commonJSFunctionArgumentsStructure();
+    JSC::JSObject* thisObject = JSC::constructEmptyObject(
+        vm,
+        thisObjectStructure);
+    thisObject->putDirectOffset(
+        vm,
+        0,
+        moduleObject);
 
-        this->putDirectOffset(
-            vm,
-            0,
-            exportsObject);
+    thisObject->putDirectOffset(
+        vm,
+        1,
+        dirname);
 
-        this->putDirectOffset(
-            vm,
-            1,
-            id);
+    thisObject->putDirectOffset(
+        vm,
+        2,
+        filename);
 
-        this->putDirectOffset(
-            vm,
-            2,
-            filename);
+    moduleObject->hasEvaluated = true;
+    globalObject->m_BunCommonJSModuleValue.set(vm, globalObject, thisObject);
 
-        this->putDirectOffset(
-            vm,
-            3,
-            jsBoolean(false));
+    JSValue empty = JSC::evaluate(globalObject, moduleObject->sourceCode.get()->sourceCode(), thisObject, exception);
+    moduleObject->sourceCode.clear();
 
-        this->putDirectOffset(
-            vm,
-            4,
-            dirname);
+    return exception.get() == nullptr;
+}
 
-        this->putDirectOffset(
-            vm,
-            5,
-            jsUndefined());
+JSC_DEFINE_HOST_FUNCTION(jsFunctionLoadModule, (JSGlobalObject * lexicalGlobalObject, CallFrame* callframe))
+{
+    auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm());
+    JSCommonJSModule* moduleObject = jsDynamicCast<JSCommonJSModule*>(callframe->argument(0));
+    if (!moduleObject) {
+        RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(true)));
+    }
+    // A module that already ran, or that has no source attached, is reported as loaded without evaluating.
+    if (moduleObject->hasEvaluated || !moduleObject->sourceCode) {
+        RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(true)));
     }
 
-    static JSC::Structure* createStructure(
+    WTF::NakedPtr<Exception> exception;
+
+    evaluateCommonJSModuleOnce(
+        globalObject->vm(),
+        jsCast<Zig::GlobalObject*>(globalObject),
+        moduleObject,
+        moduleObject->m_dirname.get(),
+        moduleObject->m_filename.get(),
+        exception);
+
+    if (exception.get()) {
+        // On error, remove the module from the require map
+        // so that it can be re-evaluated on the next require.
+        globalObject->requireMap()->remove(globalObject, moduleObject->id());
+
+        throwException(globalObject, throwScope, exception.get());
+        exception.clear();
+        return JSValue::encode({});
+    }
+
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsBoolean(true)));
+}
+
+JSC_DEFINE_HOST_FUNCTION(requireResolvePathsFunction, (JSGlobalObject * globalObject, CallFrame* callframe))
+{
+    return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr, 0));
+}
+
+static const HashTableValue RequireResolveFunctionPrototypeValues[] = {
+    { "paths"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, requireResolvePathsFunction, 1 } },
+};
+
+class RequireResolveFunctionPrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+    static RequireResolveFunctionPrototype* create(
         JSC::JSGlobalObject* globalObject)
     {
         auto& vm = globalObject->vm();
-        JSC::Structure* structure = JSC::Structure::create(
-            vm,
-            globalObject,
-            globalObject->objectPrototype(),
-            JSC::TypeInfo(JSC::ObjectType, JSCommonJSModule::StructureFlags),
-            JSCommonJSModule::info(),
-            JSC::NonArray,
-            6);
 
-        JSC::PropertyOffset offset;
-        auto clientData = WebCore::clientData(vm);
+        auto* structure = RequireResolveFunctionPrototype::createStructure(vm, globalObject, globalObject->functionPrototype());
+        RequireResolveFunctionPrototype* prototype = new (NotNull, JSC::allocateCell<RequireResolveFunctionPrototype>(vm)) RequireResolveFunctionPrototype(vm, structure);
+        prototype->finishCreation(vm);
+        return prototype;
+    }
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "exports"_s),
-            0,
-            offset);
+    DECLARE_INFO;
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "id"_s),
-            0,
-            offset);
+    RequireResolveFunctionPrototype(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : Base(vm, structure)
+    {
+    }
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "filename"_s),
-            0,
-            offset);
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        return &vm.plainObjectSpace();
+    }
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "loaded"_s),
-            0,
-            offset);
+    void finishCreation(JSC::VM& vm);
+};
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "path"_s),
-            0,
-            offset);
+static const HashTableValue RequireFunctionPrototypeValues[] = {
+    { "cache"_s, static_cast<unsigned>(JSC::PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, Zig::jsRequireCacheGetter, Zig::jsRequireCacheSetter } },
+};
 
-        structure = structure->addPropertyTransition(
-            vm,
-            structure,
-            JSC::Identifier::fromString(vm, "require"_s),
-            0,
-            offset);
+class RequireFunctionPrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+    static RequireFunctionPrototype* create(
+        JSC::JSGlobalObject* globalObject)
+    {
+        auto& vm = globalObject->vm();
 
-        return structure;
+        auto* structure = RequireFunctionPrototype::createStructure(vm, globalObject, globalObject->functionPrototype());
+        RequireFunctionPrototype* prototype = new (NotNull, JSC::allocateCell<RequireFunctionPrototype>(vm)) RequireFunctionPrototype(vm, structure);
+        prototype->finishCreation(vm);
+
+        JSFunction* resolveFunction = JSFunction::create(vm, moduleRequireResolveCodeGenerator(vm), globalObject->globalScope(), JSFunction::createStructure(vm, globalObject, RequireResolveFunctionPrototype::create(globalObject)));
+        prototype->putDirect(vm, JSC::Identifier::fromString(vm, "resolve"_s), resolveFunction, PropertyAttribute::Function | 0);
+
+        return prototype;
     }
 
-    static JSCommonJSModule* create(
+    RequireFunctionPrototype(
         JSC::VM& vm,
-        JSC::Structure* structure,
-        JSC::JSValue exportsObject,
-        JSC::JSString* id,
-        JSC::JSString* filename,
-        JSC::JSString* dirname,
-        JSC::JSValue requireFunction)
+        JSC::Structure* structure)
+        : Base(vm, structure)
     {
-        JSCommonJSModule* cell = new (NotNull, JSC::allocateCell<JSCommonJSModule>(vm)) JSCommonJSModule(vm, structure);
-        cell->finishCreation(vm, exportsObject, id, filename, dirname, requireFunction);
-        return cell;
     }
 
-    JSValue exportsObject()
+    DECLARE_INFO;
+
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
-        return m_exportsObject.get();
+        return &vm.plainObjectSpace();
     }
 
-    JSValue id()
+    void finishCreation(JSC::VM& vm)
     {
-        return m_id.get();
+        Base::finishCreation(vm);
+        ASSERT(inherits(vm, info()));
+
+        reifyStaticProperties(vm, info(), RequireFunctionPrototypeValues, *this);
+        JSC::JSFunction* requireDotMainFunction = JSFunction::create(
+            vm,
+            moduleMainCodeGenerator(vm),
+            globalObject()->globalScope());
+
+        this->putDirect(
+            vm,
+            JSC::Identifier::fromString(vm, "main"_s),
+            JSC::GetterSetter::create(vm, globalObject(), requireDotMainFunction, JSValue()),
+            PropertyAttribute::Builtin | PropertyAttribute::Accessor | PropertyAttribute::ReadOnly | 0);
+        this->putDirect(vm, JSC::Identifier::fromString(vm, "extensions"_s), constructEmptyObject(globalObject()), 0);
     }
+};
 
-    DECLARE_VISIT_CHILDREN;
+JSC_DEFINE_CUSTOM_GETTER(getterFilename, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    // Reads m_filename from the receiver; a receiver that is not a JSCommonJSModule yields undefined.
+    auto* cjsModule = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (cjsModule)
+        return JSValue::encode(cjsModule->m_filename.get());
+    return JSValue::encode(jsUndefined());
+}
+JSC_DEFINE_CUSTOM_GETTER(getterId, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    // Reads m_id from the receiver; a receiver that is not a JSCommonJSModule yields undefined.
+    auto* cjsModule = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (cjsModule)
+        return JSValue::encode(cjsModule->m_id.get());
+    return JSValue::encode(jsUndefined());
+}
 
-    static bool put(
-        JSC::JSCell* cell,
-        JSC::JSGlobalObject* globalObject,
-        JSC::PropertyName propertyName,
-        JSC::JSValue value,
-        JSC::PutPropertySlot& slot)
-    {
+JSC_DEFINE_CUSTOM_GETTER(getterPath, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    // `path` reports the module's directory (m_dirname), not m_id; non-module receivers yield undefined.
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+    return JSValue::encode(thisObject->m_dirname.get());
+}
 
-        auto& vm = globalObject->vm();
-        auto* clientData = WebCore::clientData(vm);
-        auto throwScope = DECLARE_THROW_SCOPE(vm);
+JSC_DEFINE_CUSTOM_SETTER(setterPath,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (!thisObject) // receiver is not a JSCommonJSModule: report the assignment as failed
+        return false;
 
-        if (propertyName == clientData->builtinNames().exportsPublicName()) {
-            JSCommonJSModule* thisObject = jsCast<JSCommonJSModule*>(cell);
-            ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    thisObject->m_dirname.set(globalObject->vm(), thisObject, JSValue::decode(value).toString(globalObject)); // `path` is backed by m_dirname; writing m_id here would change `id`
+    return true;
+}
 
-            // It will crash if we attempt to assign Object.defineProperty() result to a JSMap*.
-            if (UNLIKELY(slot.thisValue() != thisObject))
-                RELEASE_AND_RETURN(throwScope, JSObject::definePropertyOnReceiver(globalObject, propertyName, value, slot));
+extern "C" EncodedJSValue Resolver__propForRequireMainPaths(JSGlobalObject*);
 
-            JSValue prevValue = thisObject->m_exportsObject.get();
+JSC_DEFINE_CUSTOM_GETTER(getterPaths, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject)) {
+        return JSValue::encode(jsUndefined());
+    }
 
-            // TODO: refactor this to not go through ESM path and we don't need to do this check.
-            // IF we do this on every call, it causes GC to happen in a place that it may not be able to.
-            // This breaks loading Bluebird in some cases, for example.
-            // We need to update the require map "live" because otherwise the code in Discord.js will break
-            // The bug is something to do with exception handling which causes GC to happen in the error path and then boom.
-            if (prevValue != value && (!prevValue.isCell() || !value.isCell() || prevValue.asCell()->type() != value.asCell()->type())) {
-                jsCast<Zig::GlobalObject*>(globalObject)->requireMap()->set(globalObject, thisObject->id(), value);
-            }
+    if (!thisObject->m_paths) {
+        JSValue paths = JSValue::decode(Resolver__propForRequireMainPaths(globalObject));
+        thisObject->m_paths.set(globalObject->vm(), thisObject, paths);
+    }
 
-            thisObject->m_exportsObject.set(vm, thisObject, value);
-        }
+    return JSValue::encode(thisObject->m_paths.get());
+}
 
-        RELEASE_AND_RETURN(throwScope, Base::put(cell, globalObject, propertyName, value, slot));
+JSC_DEFINE_CUSTOM_SETTER(setterPaths,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (!thisObject)
+        return false;
+    // The assigned value is stored in m_paths as-is, with no conversion or validation.
+    thisObject->m_paths.set(globalObject->vm(), thisObject, JSValue::decode(value));
+    return true;
+}
+
+JSC_DEFINE_CUSTOM_SETTER(setterFilename,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (!thisObject)
+        return false;
+    // The assigned value is converted with toString() before being stored in m_filename.
+    thisObject->m_filename.set(globalObject->vm(), thisObject, JSValue::decode(value).toString(globalObject));
+    return true;
+}
+
+JSC_DEFINE_CUSTOM_SETTER(setterId,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(JSValue::decode(thisValue));
+    if (!thisObject)
+        return false;
+    // The assigned value is converted with toString() before being stored in m_id.
+    thisObject->m_id.set(globalObject->vm(), thisObject, JSValue::decode(value).toString(globalObject));
+    return true;
+}
+
+static JSValue createLoaded(VM& vm, JSObject* object)
+{
+    JSCommonJSModule* cjs = jsCast<JSCommonJSModule*>(object); // NOTE(review): this callback is registered in the prototype's static table — confirm `object` is really a JSCommonJSModule and not the prototype
+    return jsBoolean(cjs->hasEvaluated);
+}
+static JSValue createParent(VM& vm, JSObject* object)
+{
+    return jsUndefined();
+}
+static JSValue createChildren(VM& vm, JSObject* object)
+{
+    return constructEmptyArray(object->globalObject(), nullptr, 0);
+}
+
+static const struct HashTableValue JSCommonJSModulePrototypeTableValues[] = {
+    { "children"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback | PropertyAttribute::DontEnum | 0), NoIntrinsic, { HashTableValue::LazyPropertyType, createChildren } },
+    { "filename"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterFilename, setterFilename } },
+    { "id"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterId, setterId } },
+    { "loaded"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback | PropertyAttribute::DontEnum | 0), NoIntrinsic, { HashTableValue::LazyPropertyType, createLoaded } },
+    { "parent"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback | PropertyAttribute::DontEnum | 0), NoIntrinsic, { HashTableValue::LazyPropertyType, createParent } },
+    { "path"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterPath, setterPath } },
+    { "paths"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, getterPaths, setterPaths } },
+};
+
+class JSCommonJSModulePrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+    static JSCommonJSModulePrototype* create(
+        JSC::VM& vm,
+        JSC::JSGlobalObject* globalObject,
+        JSC::Structure* structure)
+    {
+        JSCommonJSModulePrototype* prototype = new (NotNull, JSC::allocateCell<JSCommonJSModulePrototype>(vm)) JSCommonJSModulePrototype(vm, structure);
+        prototype->finishCreation(vm, globalObject);
+        return prototype;
     }
 
     DECLARE_INFO;
-    template<typename, SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+
+    JSCommonJSModulePrototype(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : Base(vm, structure)
     {
-        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
-            return nullptr;
-        return WebCore::subspaceForImpl<JSCommonJSModule, WebCore::UseCustomHeapCellType::No>(
-            vm,
-            [](auto& spaces) { return spaces.m_clientSubspaceForCommonJSModuleRecord.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); },
-            [](auto& spaces) { return spaces.m_subspaceForCommonJSModuleRecord.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_subspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); });
     }
 
-    JSCommonJSModule(JSC::VM& vm, JSC::Structure* structure)
-        : Base(vm, structure)
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        return &vm.plainObjectSpace();
+    }
+
+    void finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
     {
+        Base::finishCreation(vm);
+        ASSERT(inherits(vm, info()));
+        reifyStaticProperties(vm, JSCommonJSModule::info(), JSCommonJSModulePrototypeTableValues, *this);
+
+        JSFunction* requireFunction = JSFunction::create(
+            vm,
+            moduleRequireCodeGenerator(vm),
+            globalObject->globalScope(),
+            JSFunction::createStructure(vm, globalObject, RequireFunctionPrototype::create(globalObject)));
+
+        this->putDirect(vm, clientData(vm)->builtinNames().requirePublicName(), requireFunction, PropertyAttribute::Builtin | PropertyAttribute::Function | 0);
+
+        this->putDirectNativeFunction(
+            vm,
+            globalObject,
+            clientData(vm)->builtinNames().requirePrivateName(),
+            2,
+            jsFunctionRequireCommonJS, ImplementationVisibility::Public, NoIntrinsic, JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::DontDelete | 0);
     }
 };
 
-Structure* createCommonJSModuleStructure(
-    Zig::GlobalObject* globalObject)
+const JSC::ClassInfo JSCommonJSModulePrototype::s_info = { "Module"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSCommonJSModulePrototype) };
+
+void JSCommonJSModule::finishCreation(JSC::VM& vm, JSC::JSString* id, JSC::JSString* filename, JSC::JSString* dirname, JSC::JSSourceCode* sourceCode)
 {
-    return JSCommonJSModule::createStructure(globalObject);
+    Base::finishCreation(vm);
+    ASSERT(inherits(vm, info()));
+    m_id.set(vm, this, id);
+    m_filename.set(vm, this, filename);
+    m_dirname.set(vm, this, dirname);
+    this->sourceCode.set(vm, this, sourceCode);
 }
 
-template<typename Visitor>
-void JSCommonJSModule::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+JSC::Structure* JSCommonJSModule::createStructure(
+    JSC::JSGlobalObject* globalObject)
 {
-    JSCommonJSModule* thisObject = jsCast<JSCommonJSModule*>(cell);
-    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
-    Base::visitChildren(thisObject, visitor);
-    visitor.append(thisObject->m_exportsObject);
-    visitor.append(thisObject->m_id);
-}
+    auto& vm = globalObject->vm();
 
-DEFINE_VISIT_CHILDREN(JSCommonJSModule);
-const JSC::ClassInfo JSCommonJSModule::s_info = { "Module"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSCommonJSModule) };
+    auto* prototype = JSCommonJSModulePrototype::create(vm, globalObject, JSCommonJSModulePrototype::createStructure(vm, globalObject, globalObject->objectPrototype()));
 
-static bool canPerformFastEnumeration(Structure* s)
+    // Do not set the number of inline properties on this structure
+    // there may be an off-by-one error in the Structure which causes `require.id` to become the require
+    return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info(), NonArray);
+}
+
+JSCommonJSModule* JSCommonJSModule::create(
+    JSC::VM& vm,
+    JSC::Structure* structure,
+    JSC::JSString* id,
+    JSC::JSString* filename,
+    JSC::JSString* dirname,
+    JSC::JSSourceCode* sourceCode)
 {
-    if (s->typeInfo().overridesGetOwnPropertySlot())
-        return false;
-    if (s->typeInfo().overridesAnyFormOfGetOwnPropertyNames())
-        return false;
-    if (hasIndexedProperties(s->indexingType()))
-        return false;
-    if (s->hasAnyKindOfGetterSetterProperties())
-        return false;
-    if (s->isUncacheableDictionary())
-        return false;
-    if (s->hasUnderscoreProtoPropertyExcludingOriginalProto())
-        return false;
-    return true;
+    JSCommonJSModule* cell = new (NotNull, JSC::allocateCell<JSCommonJSModule>(vm)) JSCommonJSModule(vm, structure);
+    cell->finishCreation(vm, id, filename, dirname, sourceCode);
+    return cell;
 }
 
-JSValue evaluateCommonJSModule(
-    Zig::GlobalObject* globalObject,
-    Ref<Zig::SourceProvider> sourceProvider,
-    const WTF::String& sourceURL,
-    ResolvedSource source)
+JSC_DEFINE_HOST_FUNCTION(jsFunctionCreateCommonJSModule, (JSGlobalObject * globalObject, CallFrame* callframe))
 {
     auto& vm = globalObject->vm();
 
-    auto throwScope = DECLARE_THROW_SCOPE(vm);
-    auto* requireMapKey = jsString(vm, sourceURL);
+    auto id = callframe->argument(0).toWTFString(globalObject);
+
+    JSValue object = callframe->argument(1);
+
+    return JSValue::encode(
+        JSCommonJSModule::create(
+            jsCast<Zig::GlobalObject*>(globalObject),
+            id,
+            object, callframe->argument(2).isBoolean() && callframe->argument(2).asBoolean()));
+}
 
-    JSC::JSObject* exportsObject = source.commonJSExportsLen < 64
-        ? JSC::constructEmptyObject(globalObject, globalObject->objectPrototype(), source.commonJSExportsLen)
-        : JSC::constructEmptyObject(globalObject, globalObject->objectPrototype());
-    auto index = sourceURL.reverseFind('/', sourceURL.length());
+JSCommonJSModule* JSCommonJSModule::create(
+    Zig::GlobalObject* globalObject,
+    const WTF::String& key,
+    JSValue exportsObject,
+    bool hasEvaluated)
+{
+    auto& vm = globalObject->vm();
+    JSString* requireMapKey = JSC::jsStringWithCache(vm, key);
+    auto index = key.reverseFind('/', key.length());
     JSString* dirname = jsEmptyString(vm);
     JSString* filename = requireMapKey;
     if (index != WTF::notFound) {
         dirname = JSC::jsSubstring(globalObject, requireMapKey, 0, index);
     }
 
-    globalObject->requireMap()->set(globalObject, requireMapKey, exportsObject);
-    auto* requireFunction = Zig::ImportMetaObject::createRequireFunction(vm, globalObject, sourceURL);
-
-    JSC::SourceCode inputSource(
-        WTFMove(sourceProvider));
-
-    auto* moduleObject = JSCommonJSModule::create(
+    auto* out = JSCommonJSModule::create(
         vm,
         globalObject->CommonJSModuleObjectStructure(),
-        exportsObject,
-        requireMapKey, filename, dirname, requireFunction);
+        requireMapKey, filename, dirname, nullptr);
 
-    if (UNLIKELY(throwScope.exception())) {
-        globalObject->requireMap()->remove(globalObject, requireMapKey);
-        RELEASE_AND_RETURN(throwScope, JSValue());
-    }
+    out->putDirect(vm, WebCore::clientData(vm)->builtinNames().exportsPublicName(), exportsObject, exportsObject.isCell() && exportsObject.isCallable() ? JSC::PropertyAttribute::Function | 0 : 0);
+    out->hasEvaluated = hasEvaluated;
+    return out;
+}
 
-    JSC::Structure* thisObjectStructure = globalObject->commonJSFunctionArgumentsStructure();
-    JSC::JSObject* thisObject = JSC::constructEmptyObject(
-        vm,
-        thisObjectStructure);
-    thisObject->putDirectOffset(
-        vm,
-        0,
-        moduleObject);
+void JSCommonJSModule::destroy(JSC::JSCell* cell)
+{
+    static_cast<JSCommonJSModule*>(cell)->JSCommonJSModule::~JSCommonJSModule();
+}
 
-    thisObject->putDirectOffset(
-        vm,
-        1,
-        exportsObject);
+JSCommonJSModule::~JSCommonJSModule()
+{
+}
 
-    thisObject->putDirectOffset(
-        vm,
-        2,
-        dirname);
+bool JSCommonJSModule::evaluate(
+    Zig::GlobalObject* globalObject,
+    const WTF::String& key,
+    const SyntheticSourceProvider::SyntheticSourceGenerator& generator)
+{
+    Vector<JSC::Identifier, 4> propertyNames;
+    JSC::MarkedArgumentBuffer arguments;
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    generator(globalObject, JSC::Identifier::fromString(vm, key), propertyNames, arguments);
+    RETURN_IF_EXCEPTION(throwScope, false);
+
+    bool needsPut = false;
+    auto getDefaultValue = [&]() -> JSValue {
+        size_t defaultValueIndex = propertyNames.find(vm.propertyNames->defaultKeyword);
+        auto cjsSymbol = Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s));
+
+        if (defaultValueIndex != notFound && propertyNames.contains(cjsSymbol)) {
+            JSValue current = arguments.at(defaultValueIndex);
+            needsPut = true;
+            return current;
+        }
 
-    thisObject->putDirectOffset(
-        vm,
-        3,
-        filename);
+        size_t count = propertyNames.size();
+        JSValue existingDefaultObject = this->getIfPropertyExists(globalObject, WebCore::clientData(vm)->builtinNames().exportsPublicName());
+        JSObject* defaultObject;
 
-    thisObject->putDirectOffset(
-        vm,
-        4,
-        requireFunction);
+        if (existingDefaultObject && existingDefaultObject.isObject()) {
+            defaultObject = jsCast<JSObject*>(existingDefaultObject);
+        } else {
+            defaultObject = JSC::constructEmptyObject(globalObject, globalObject->objectPrototype());
+            needsPut = true;
+        }
 
-    {
-        WTF::NakedPtr<Exception> exception;
-        globalObject->m_BunCommonJSModuleValue.set(vm, globalObject, thisObject);
-        JSC::evaluate(globalObject, inputSource, globalObject->globalThis(), exception);
-
-        if (exception.get()) {
-            throwScope.throwException(globalObject, exception->value());
-            exception.clear();
-            RELEASE_AND_RETURN(throwScope, JSValue());
+        for (size_t i = 0; i < count; ++i) {
+            auto prop = propertyNames[i];
+            unsigned attributes = 0;
+
+            JSValue value = arguments.at(i);
+
+            if (prop.isSymbol()) {
+                attributes |= JSC::PropertyAttribute::DontEnum;
+            }
+
+            if (value.isCell() && value.isCallable()) {
+                attributes |= JSC::PropertyAttribute::Function;
+            }
+
+            defaultObject->putDirect(vm, prop, value, attributes);
+        }
+
+        return defaultObject;
+    };
+
+    JSValue defaultValue = getDefaultValue();
+    if (needsPut) {
+        unsigned attributes = 0;
+
+        if (defaultValue.isCell() && defaultValue.isCallable()) {
+            attributes |= JSC::PropertyAttribute::Function;
         }
-    }
 
-    if (UNLIKELY(throwScope.exception())) {
-        globalObject->requireMap()->remove(globalObject, requireMapKey);
-        RELEASE_AND_RETURN(throwScope, JSValue());
+        this->putDirect(vm, WebCore::clientData(vm)->builtinNames().exportsPublicName(), defaultValue, attributes);
     }
 
-    JSValue result = moduleObject->exportsObject();
+    this->hasEvaluated = true;
+    RELEASE_AND_RETURN(throwScope, true);
+}
+
+void JSCommonJSModule::toSyntheticSource(JSC::JSGlobalObject* globalObject,
+    JSC::Identifier moduleKey,
+    Vector<JSC::Identifier, 4>& exportNames,
+    JSC::MarkedArgumentBuffer& exportValues)
+{
+    auto result = this->exportsObject();
+
+    auto& vm = globalObject->vm();
+
+    // This exists to tell ImportMetaObject.ts that this is a CommonJS module.
+    exportNames.append(Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s)));
+    exportValues.append(jsNumber(0));
 
-    // The developer can do something like:
+    // Bun's interpretation of the "__esModule" annotation:
     //
-    //   Object.defineProperty(module, 'exports', {get: getter})
+    //   - If a "default" export does not exist OR the __esModule annotation is not present, then we
+    //   set the default export to the exports object
     //
-    // In which case, the exports object is now a GetterSetter object.
+    //   - If a "default" export also exists, then we set the default export
+    //   to the value of it (matching Babel behavior)
     //
-    // We can't return a GetterSetter object to ESM code, so we need to call it.
-    if (!result.isEmpty() && (result.isGetterSetter() || result.isCustomGetterSetter())) {
-        auto* clientData = WebCore::clientData(vm);
-
-        // TODO: is there a faster way to call these getters? We shouldn't need to do a full property lookup.
-        //
-        // we use getIfPropertyExists just incase a pathological devleoper did:
-        //
-        //   - Object.defineProperty(module, 'exports', {get: getter})
-        //   - delete module.exports
-        //
-        if (result.isGetterSetter()) {
-            JSC::GetterSetter* getter = jsCast<JSC::GetterSetter*>(result);
-            result = getter->callGetter(globalObject, moduleObject);
-        } else {
-            result = moduleObject->getIfPropertyExists(globalObject, clientData->builtinNames().exportsPublicName());
+    // https://stackoverflow.com/questions/50943704/whats-the-purpose-of-object-definepropertyexports-esmodule-value-0
+    // https://github.com/nodejs/node/issues/40891
+    // https://github.com/evanw/bundler-esm-cjs-tests
+    // https://github.com/evanw/esbuild/issues/1591
+    // https://github.com/oven-sh/bun/issues/3383
+    //
+    // Note that this interpretation is slightly different
+    //
+    //    -  We do not ignore when "type": "module" or when the file
+    //       extension is ".mjs". Build tools determine that based on the
+    //       caller's behavior, but in a JS runtime, there is only one ModuleNamespaceObject.
+    //
+    //       It would be possible to match the behavior at runtime, but
+    //       it would need further engine changes which do not match the ES Module spec
+    //
+    //   -   We ignore the value of the annotation. We only look for the
+    //       existence of the value being set. This is for performance reasons, but also
+    //       this annotation is meant for tooling and the only usages of setting
+    //       it to something that does NOT evaluate to "true" I could find were in
+    //       unit tests of build tools. Happy to revisit this if users file an issue.
+    bool needsToAssignDefault = true;
+
+    if (result.isObject()) {
+        auto* exports = asObject(result);
+
+        auto* structure = exports->structure();
+        uint32_t size = structure->inlineSize() + structure->outOfLineSize();
+        exportNames.reserveCapacity(size + 2);
+        exportValues.ensureCapacity(size + 2);
+
+        auto catchScope = DECLARE_CATCH_SCOPE(vm);
+
+        Identifier esModuleMarker = builtinNames(vm).__esModulePublicName();
+        bool hasESModuleMarker = !this->ignoreESModuleAnnotation && exports->hasProperty(globalObject, esModuleMarker);
+        if (catchScope.exception()) {
+            catchScope.clearException();
         }
 
-        if (UNLIKELY(throwScope.exception())) {
-            // Unlike getters on properties of the exports object
-            // When the exports object itself is a getter and it throws
-            // There's not a lot we can do
-            // so we surface that error
-            globalObject->requireMap()->remove(globalObject, requireMapKey);
-            RELEASE_AND_RETURN(throwScope, JSValue());
+        if (hasESModuleMarker) {
+            if (canPerformFastEnumeration(structure)) {
+                exports->structure()->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
+                    auto key = entry.key();
+                    if (key->isSymbol() || entry.attributes() & PropertyAttribute::DontEnum || key == esModuleMarker)
+                        return true;
+
+                    needsToAssignDefault = needsToAssignDefault && key != vm.propertyNames->defaultKeyword;
+
+                    JSValue value = exports->getDirect(entry.offset());
+                    exportNames.append(Identifier::fromUid(vm, key));
+                    exportValues.append(value);
+                    return true;
+                });
+            } else {
+                JSC::PropertyNameArray properties(vm, JSC::PropertyNameMode::Strings, JSC::PrivateSymbolMode::Exclude);
+                exports->methodTable()->getOwnPropertyNames(exports, globalObject, properties, DontEnumPropertiesMode::Exclude);
+                if (catchScope.exception()) {
+                    catchScope.clearExceptionExceptTermination();
+                    return;
+                }
+
+                for (auto property : properties) {
+                    if (UNLIKELY(property.isEmpty() || property.isNull() || property == esModuleMarker || property.isPrivateName() || property.isSymbol()))
+                        continue;
+
+                    // ignore constructor
+                    if (property == vm.propertyNames->constructor)
+                        continue;
+
+                    JSC::PropertySlot slot(exports, PropertySlot::InternalMethodType::Get);
+                    if (!exports->getPropertySlot(globalObject, property, slot))
+                        continue;
+
+                    exportNames.append(property);
+
+                    JSValue getterResult = slot.getValue(globalObject, property);
+
+                    // If it throws, we keep them in the exports list, but mark it as undefined
+                    // This is consistent with what Node.js does.
+                    if (catchScope.exception()) {
+                        catchScope.clearException();
+                        getterResult = jsUndefined();
+                    }
+
+                    exportValues.append(getterResult);
+
+                    needsToAssignDefault = needsToAssignDefault && property != vm.propertyNames->defaultKeyword;
+                }
+            }
+
+        } else if (canPerformFastEnumeration(structure)) {
+            exports->structure()->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
+                auto key = entry.key();
+                if (key->isSymbol() || entry.attributes() & PropertyAttribute::DontEnum || key == vm.propertyNames->defaultKeyword)
+                    return true;
+
+                JSValue value = exports->getDirect(entry.offset());
+
+                exportNames.append(Identifier::fromUid(vm, key));
+                exportValues.append(value);
+                return true;
+            });
+        } else {
+            JSC::PropertyNameArray properties(vm, JSC::PropertyNameMode::Strings, JSC::PrivateSymbolMode::Exclude);
+            exports->methodTable()->getOwnPropertyNames(exports, globalObject, properties, DontEnumPropertiesMode::Exclude);
+            if (catchScope.exception()) {
+                catchScope.clearExceptionExceptTermination();
+                return;
+            }
+
+            for (auto property : properties) {
+                if (UNLIKELY(property.isEmpty() || property.isNull() || property == vm.propertyNames->defaultKeyword || property.isPrivateName() || property.isSymbol()))
+                    continue;
+
+                // ignore constructor
+                if (property == vm.propertyNames->constructor)
+                    continue;
+
+                JSC::PropertySlot slot(exports, PropertySlot::InternalMethodType::Get);
+                if (!exports->getPropertySlot(globalObject, property, slot))
+                    continue;
+
+                exportNames.append(property);
+
+                JSValue getterResult = slot.getValue(globalObject, property);
+
+                // If it throws, we keep them in the exports list, but mark it as undefined
+                // This is consistent with what Node.js does.
+                if (catchScope.exception()) {
+                    catchScope.clearException();
+                    getterResult = jsUndefined();
+                }
+
+                exportValues.append(getterResult);
+            }
         }
     }
 
-    globalObject->requireMap()->set(globalObject, requireMapKey, result);
+    if (needsToAssignDefault) {
+        exportNames.append(vm.propertyNames->defaultKeyword);
+        exportValues.append(result);
+    }
+}
+
+JSValue JSCommonJSModule::exportsObject()
+{
+    return this->get(globalObject(), JSC::PropertyName(clientData(vm())->builtinNames().exportsPublicName()));
+}
 
-    return result;
+JSValue JSCommonJSModule::id()
+{
+    return m_id.get();
+}
+
+bool JSCommonJSModule::put(
+    JSC::JSCell* cell,
+    JSC::JSGlobalObject* globalObject,
+    JSC::PropertyName propertyName,
+    JSC::JSValue value,
+    JSC::PutPropertySlot& slot)
+{
+    // Pass-through override: the write is forwarded unchanged to Base::put and its result returned.
+    // The throw scope is released as part of returning, via RELEASE_AND_RETURN.
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    RELEASE_AND_RETURN(throwScope, Base::put(cell, globalObject, propertyName, value, slot));
+}
+
+template<typename, SubspaceAccess mode> JSC::GCClient::IsoSubspace* JSCommonJSModule::subspaceFor(JSC::VM& vm)
+{
+    if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+        return nullptr;
+    return WebCore::subspaceForImpl<JSCommonJSModule, WebCore::UseCustomHeapCellType::No>(
+        vm,
+        [](auto& spaces) { return spaces.m_clientSubspaceForCommonJSModuleRecord.get(); },
+        [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); },
+        [](auto& spaces) { return spaces.m_subspaceForCommonJSModuleRecord.get(); },
+        [](auto& spaces, auto&& space) { spaces.m_subspaceForCommonJSModuleRecord = std::forward<decltype(space)>(space); });
+}
+
+Structure* createCommonJSModuleStructure(
+    Zig::GlobalObject* globalObject)
+{
+    return JSCommonJSModule::createStructure(globalObject);
 }
 
-JSC::SourceCode createCommonJSModule(
+template<typename Visitor>
+void JSCommonJSModule::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{
+    JSCommonJSModule* thisObject = jsCast<JSCommonJSModule*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    Base::visitChildren(thisObject, visitor);
+    visitor.append(thisObject->m_id);
+    visitor.append(thisObject->sourceCode);
+    visitor.append(thisObject->m_filename);
+    visitor.append(thisObject->m_dirname);
+    visitor.append(thisObject->m_paths);
+}
+
+DEFINE_VISIT_CHILDREN(JSCommonJSModule);
+const JSC::ClassInfo JSCommonJSModule::s_info = { "Module"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSCommonJSModule) };
+const JSC::ClassInfo RequireResolveFunctionPrototype::s_info = { "resolve"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(RequireResolveFunctionPrototype) };
+const JSC::ClassInfo RequireFunctionPrototype::s_info = { "require"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(RequireFunctionPrototype) };
+
+JSC_DEFINE_HOST_FUNCTION(jsFunctionRequireCommonJS, (JSGlobalObject * lexicalGlobalObject, CallFrame* callframe))
+{
+    auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    JSCommonJSModule* thisObject = jsDynamicCast<JSCommonJSModule*>(callframe->thisValue());
+    if (!thisObject)
+        return throwVMTypeError(globalObject, throwScope);
+
+    JSValue specifierValue = callframe->argument(0);
+    WTF::String specifier = specifierValue.toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, {});
+
+    // Special-case for "process" to just return the process object directly.
+    if (UNLIKELY(specifier == "process"_s || specifier == "node:process"_s)) {
+        if (auto* targetModule = jsDynamicCast<JSCommonJSModule*>(callframe->argument(1))) // argument 1 may not be a JSCommonJSModule; jsDynamicCast then yields null
+            targetModule->putDirect(vm, builtinNames(vm).exportsPublicName(), globalObject->processObject(), 0);
+        return JSValue::encode(globalObject->processObject());
+    }
+
+    WTF::String referrer = thisObject->id().toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, {});
+    BunString specifierStr = Bun::toString(specifier);
+    BunString referrerStr = Bun::toString(referrer);
+
+    JSValue fetchResult = Bun::fetchCommonJSModule(
+        globalObject,
+        jsDynamicCast<JSCommonJSModule*>(callframe->argument(1)),
+        specifierValue,
+        &specifierStr,
+        &referrerStr);
+
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(fetchResult));
+}
+
+void RequireResolveFunctionPrototype::finishCreation(JSC::VM& vm)
+{
+    Base::finishCreation(vm);
+    ASSERT(inherits(vm, info()));
+
+    reifyStaticProperties(vm, RequireResolveFunctionPrototype::info(), RequireResolveFunctionPrototypeValues, *this);
+    JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+}
+
+bool JSCommonJSModule::evaluate(
     Zig::GlobalObject* globalObject,
+    const WTF::String& key,
     ResolvedSource source)
 {
-    auto sourceURL = Zig::toStringCopy(source.source_url);
-    auto sourceProvider = Zig::SourceProvider::create(globalObject, source, JSC::SourceProviderSourceType::Program);
+    auto& vm = globalObject->vm();
+    auto sourceProvider = Zig::SourceProvider::create(jsCast<Zig::GlobalObject*>(globalObject), source, JSC::SourceProviderSourceType::Program);
+    this->ignoreESModuleAnnotation = source.tag == ResolvedSourceTagPackageJSONTypeModule;
+    JSC::SourceCode rawInputSource(
+        WTFMove(sourceProvider));
 
-    return JSC::SourceCode(
-        JSC::SyntheticSourceProvider::create(
-            [source, sourceProvider = WTFMove(sourceProvider), sourceURL](JSC::JSGlobalObject* globalObject,
-                JSC::Identifier moduleKey,
-                Vector<JSC::Identifier, 4>& exportNames,
-                JSC::MarkedArgumentBuffer& exportValues) -> void {
-                JSValue result = evaluateCommonJSModule(
-                    jsCast<Zig::GlobalObject*>(globalObject),
-                    WTFMove(sourceProvider),
-                    sourceURL,
-                    source);
+    if (this->hasEvaluated)
+        return true;
 
-                if (!result) {
-                    return;
-                }
+    this->sourceCode.set(vm, this, JSC::JSSourceCode::create(vm, WTFMove(rawInputSource)));
 
-                auto& vm = globalObject->vm();
+    WTF::NakedPtr<JSC::Exception> exception;
 
-                exportNames.append(vm.propertyNames->defaultKeyword);
-                exportValues.append(result);
-
-                // This exists to tell ImportMetaObject.ts that this is a CommonJS module.
-                exportNames.append(Identifier::fromUid(vm.symbolRegistry().symbolForKey("CommonJS"_s)));
-                exportValues.append(jsNumber(0));
-
-                if (result.isObject()) {
-                    DeferGCForAWhile deferGC(vm);
-                    auto* exports = asObject(result);
-
-                    auto* structure = exports->structure();
-                    uint32_t size = structure->inlineSize() + structure->outOfLineSize();
-                    exportNames.reserveCapacity(size + 2);
-                    exportValues.ensureCapacity(size + 2);
-
-                    if (canPerformFastEnumeration(structure)) {
-                        exports->structure()->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
-                            auto key = entry.key();
-                            if (key->isSymbol() || key == vm.propertyNames->defaultKeyword || entry.attributes() & PropertyAttribute::DontEnum)
-                                return true;
-
-                            exportNames.append(Identifier::fromUid(vm, key));
-
-                            JSValue value = exports->getDirect(entry.offset());
-
-                            exportValues.append(value);
-                            return true;
-                        });
-                    } else {
-                        auto catchScope = DECLARE_CATCH_SCOPE(vm);
-                        JSC::PropertyNameArray properties(vm, JSC::PropertyNameMode::Strings, JSC::PrivateSymbolMode::Exclude);
-                        exports->methodTable()->getOwnPropertyNames(exports, globalObject, properties, DontEnumPropertiesMode::Exclude);
-                        if (catchScope.exception()) {
-                            catchScope.clearExceptionExceptTermination();
-                            return;
-                        }
+    evaluateCommonJSModuleOnce(vm, globalObject, this, this->m_dirname.get(), this->m_filename.get(), exception);
+
+    if (exception.get()) {
+        // On error, remove the module from the require map
+        // so that it can be re-evaluated on the next require.
+        globalObject->requireMap()->remove(globalObject, this->id());
+
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        throwException(globalObject, throwScope, exception.get());
+        exception.clear();
+
+        return false;
+    }
+
+    return true;
+}
+
+std::optional<JSC::SourceCode> createCommonJSModule(
+    Zig::GlobalObject* globalObject,
+    ResolvedSource source)
+{
+    JSCommonJSModule* moduleObject = nullptr; // must start null: it is only assigned when the require map has an entry, yet is tested with `!moduleObject` either way
+    WTF::String sourceURL = toStringCopy(source.source_url);
+
+    JSValue specifierValue = Bun::toJS(globalObject, source.specifier);
+    JSValue entry = globalObject->requireMap()->get(globalObject, specifierValue);
 
-                        for (auto property : properties) {
-                            if (UNLIKELY(property.isEmpty() || property.isNull() || property.isPrivateName() || property.isSymbol()))
-                                continue;
+    auto sourceProvider = Zig::SourceProvider::create(jsCast<Zig::GlobalObject*>(globalObject), source, JSC::SourceProviderSourceType::Program);
+    bool ignoreESModuleAnnotation = source.tag == ResolvedSourceTagPackageJSONTypeModule;
+    SourceOrigin sourceOrigin = sourceProvider->sourceOrigin();
 
-                            // ignore constructor
-                            if (property == vm.propertyNames->constructor || property == vm.propertyNames->defaultKeyword)
-                                continue;
+    if (entry) {
+        moduleObject = jsDynamicCast<JSCommonJSModule*>(entry);
+    }
+
+    if (!moduleObject) {
+        auto& vm = globalObject->vm();
+        auto* requireMapKey = jsStringWithCache(vm, sourceURL);
+        auto index = sourceURL.reverseFind('/', sourceURL.length());
+        JSString* dirname = jsEmptyString(vm);
+        JSString* filename = requireMapKey;
+        if (index != WTF::notFound) {
+            dirname = JSC::jsSubstring(globalObject, requireMapKey, 0, index);
+        }
 
-                            JSC::PropertySlot slot(exports, PropertySlot::InternalMethodType::Get);
-                            if (!exports->getPropertySlot(globalObject, property, slot))
-                                continue;
+        JSC::SourceCode rawInputSource(
+            WTFMove(sourceProvider));
 
-                            exportNames.append(property);
+        moduleObject = JSCommonJSModule::create(
+            vm,
+            globalObject->CommonJSModuleObjectStructure(),
+            requireMapKey, filename, dirname, JSC::JSSourceCode::create(vm, WTFMove(rawInputSource)));
 
-                            JSValue getterResult = slot.getValue(globalObject, property);
+        moduleObject->putDirect(vm,
+            WebCore::clientData(vm)->builtinNames().exportsPublicName(),
+            JSC::constructEmptyObject(globalObject, globalObject->objectPrototype()), 0);
 
-                            // If it throws, we keep them in the exports list, but mark it as undefined
-                            // This is consistent with what Node.js does.
-                            if (catchScope.exception()) {
-                                catchScope.clearException();
-                                getterResult = jsUndefined();
-                            }
+        globalObject->requireMap()->set(globalObject, requireMapKey, moduleObject);
+    }
+
+    moduleObject->ignoreESModuleAnnotation = ignoreESModuleAnnotation;
+
+    return JSC::SourceCode(
+        JSC::SyntheticSourceProvider::create(
+            [](JSC::JSGlobalObject* lexicalGlobalObject,
+                JSC::Identifier moduleKey,
+                Vector<JSC::Identifier, 4>& exportNames,
+                JSC::MarkedArgumentBuffer& exportValues) -> void {
+                auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+                auto& vm = globalObject->vm();
 
-                            exportValues.append(getterResult);
+                JSValue keyValue = identifierToJSValue(vm, moduleKey);
+                JSValue entry = globalObject->requireMap()->get(globalObject, keyValue);
+
+                if (entry) {
+                    if (auto* moduleObject = jsDynamicCast<JSCommonJSModule*>(entry)) {
+                        if (!moduleObject->hasEvaluated) {
+                            WTF::NakedPtr<JSC::Exception> exception;
+                            if (!evaluateCommonJSModuleOnce(
+                                    vm,
+                                    globalObject,
+                                    moduleObject,
+                                    moduleObject->m_dirname.get(),
+                                    moduleObject->m_filename.get(), exception)) {
+
+                                // On error, remove the module from the require map
+                                // so that it can be re-evaluated on the next require.
+                                globalObject->requireMap()->remove(globalObject, moduleObject->id());
+
+                                auto scope = DECLARE_THROW_SCOPE(vm);
+                                throwException(globalObject, scope, exception.get());
+                                exception.clear();
+                                return;
+                            }
                         }
+
+                        moduleObject->toSyntheticSource(globalObject, moduleKey, exportNames, exportValues);
                     }
                 }
             },
-            SourceOrigin(WTF::URL::fileURLWithFileSystemPath(sourceURL)),
+            sourceOrigin,
             sourceURL));
 }
-
 }
 \ No newline at end of file
diff --git a/src/bun.js/bindings/CommonJSModuleRecord.h b/src/bun.js/bindings/CommonJSModuleRecord.h
index 86daf875d..15792f9da 100644
--- a/src/bun.js/bindings/CommonJSModuleRecord.h
+++ b/src/bun.js/bindings/CommonJSModuleRecord.h
@@ -6,14 +6,92 @@ class GlobalObject;
 }
 namespace JSC {
 class SourceCode;
+class JSSourceCode;
+class ProgramExecutable;
+class AbstractModuleRecord;
 }
 
 namespace Bun {
 
+JSC_DECLARE_HOST_FUNCTION(jsFunctionCreateCommonJSModule);
+JSC_DECLARE_HOST_FUNCTION(jsFunctionLoadModule);
+
+class JSCommonJSModule final : public JSC::JSDestructibleObject {
+public:
+    using Base = JSC::JSDestructibleObject;
+    static constexpr unsigned StructureFlags = Base::StructureFlags | JSC::OverridesPut;
+
+    mutable JSC::WriteBarrier<JSC::JSString> m_id;
+    mutable JSC::WriteBarrier<JSC::JSString> m_filename;
+    mutable JSC::WriteBarrier<JSC::JSString> m_dirname;
+    mutable JSC::WriteBarrier<Unknown> m_paths;
+    mutable JSC::WriteBarrier<JSC::JSSourceCode> sourceCode;
+    bool ignoreESModuleAnnotation { false };
+
+    static void destroy(JSC::JSCell*);
+    ~JSCommonJSModule();
+
+    void finishCreation(JSC::VM& vm,
+        JSC::JSString* id, JSC::JSString* filename,
+        JSC::JSString* dirname, JSC::JSSourceCode* sourceCode);
+
+    static JSC::Structure* createStructure(JSC::JSGlobalObject* globalObject);
+
+    bool evaluate(Zig::GlobalObject* globalObject, const WTF::String& sourceURL, ResolvedSource resolvedSource);
+    bool evaluate(Zig::GlobalObject* globalObject, const WTF::String& key, const SyntheticSourceProvider::SyntheticSourceGenerator& generator);
+
+    static JSCommonJSModule* create(JSC::VM& vm, JSC::Structure* structure,
+        JSC::JSString* id,
+        JSC::JSString* filename,
+        JSC::JSString* dirname, JSC::JSSourceCode* sourceCode);
+
+    static JSCommonJSModule* create(
+        Zig::GlobalObject* globalObject,
+        const WTF::String& key,
+        JSValue exportsObject,
+        bool hasEvaluated = false);
+
+    static JSCommonJSModule* create(
+        Zig::GlobalObject* globalObject,
+        const WTF::String& key,
+        ResolvedSource resolvedSource);
+
+    void toSyntheticSource(JSC::JSGlobalObject* globalObject,
+        JSC::Identifier moduleKey,
+        Vector<JSC::Identifier, 4>& exportNames,
+        JSC::MarkedArgumentBuffer& exportValues);
+
+    JSValue exportsObject();
+    JSValue id();
+
+    DECLARE_VISIT_CHILDREN;
+
+    static bool put(JSC::JSCell* cell, JSC::JSGlobalObject* globalObject,
+        JSC::PropertyName propertyName, JSC::JSValue value,
+        JSC::PutPropertySlot& slot);
+
+    DECLARE_INFO;
+    template<typename, SubspaceAccess mode>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm);
+
+    bool hasEvaluated = false;
+
+    JSCommonJSModule(JSC::VM& vm, JSC::Structure* structure)
+        : Base(vm, structure)
+    {
+    }
+};
+
+JSCommonJSModule* createCommonJSModuleWithoutRunning(
+    Zig::GlobalObject* globalObject,
+    Ref<Zig::SourceProvider> sourceProvider,
+    const WTF::String& sourceURL,
+    ResolvedSource source);
+
 JSC::Structure* createCommonJSModuleStructure(
     Zig::GlobalObject* globalObject);
 
-JSC::SourceCode createCommonJSModule(
+std::optional<JSC::SourceCode> createCommonJSModule(
     Zig::GlobalObject* globalObject,
     ResolvedSource source);
 
diff --git a/src/bun.js/bindings/FFI.zig b/src/bun.js/bindings/FFI.zig
index 087d8308c..fde4a8d30 100644
--- a/src/bun.js/bindings/FFI.zig
+++ b/src/bun.js/bindings/FFI.zig
@@ -42,7 +42,7 @@ pub inline fn JSVALUE_TO_UINT64(arg_value: EncodedJSValue) u64 {
         return @bitCast(u64, @as(c_longlong, JSVALUE_TO_INT32(value)));
     }
     if (JSVALUE_IS_NUMBER(value)) {
-        return @floatToInt(u64, JSVALUE_TO_DOUBLE(value));
+        return @intFromFloat(u64, JSVALUE_TO_DOUBLE(value));
     }
     return JSVALUE_TO_UINT64_SLOW(value);
 }
@@ -52,7 +52,7 @@ pub inline fn JSVALUE_TO_INT64(arg_value: EncodedJSValue) i64 {
         return @bitCast(i64, @as(c_longlong, JSVALUE_TO_INT32(value)));
     }
     if (JSVALUE_IS_NUMBER(value)) {
-        return @floatToInt(i64, JSVALUE_TO_DOUBLE(value));
+        return @intFromFloat(i64, JSVALUE_TO_DOUBLE(value));
     }
     return JSVALUE_TO_INT64_SLOW(value);
 }
@@ -67,7 +67,7 @@ pub inline fn UINT64_TO_JSVALUE(arg_globalObject: ?*anyopaque, arg_val: u64) Enc
         return INT32_TO_JSVALUE(@bitCast(i32, @truncate(c_uint, val)));
     }
     if (val < @bitCast(c_ulonglong, @as(c_longlong, @as(c_long, 9007199254740991)))) {
-        return DOUBLE_TO_JSVALUE(@intToFloat(f64, val));
+        return DOUBLE_TO_JSVALUE(@floatFromInt(f64, val));
     }
     return UINT64_TO_JSVALUE_SLOW(@ptrCast(*@import("./bindings.zig").JSGlobalObject, globalObject.?), val).asEncoded();
 }
@@ -78,7 +78,7 @@ pub inline fn INT64_TO_JSVALUE(arg_globalObject: ?*anyopaque, arg_val: i64) Enco
         return INT32_TO_JSVALUE(@bitCast(i32, @truncate(c_int, val)));
     }
     if ((val >= @bitCast(c_longlong, @as(c_longlong, -@as(c_long, 9007199254740991)))) and (val <= @bitCast(c_longlong, @as(c_longlong, @as(c_long, 9007199254740991))))) {
-        return DOUBLE_TO_JSVALUE(@intToFloat(f64, val));
+        return DOUBLE_TO_JSVALUE(@floatFromInt(f64, val));
     }
     return INT64_TO_JSVALUE_SLOW(@ptrCast(*@import("./bindings.zig").JSGlobalObject, globalObject.?), val).asEncoded();
 }
@@ -97,18 +97,18 @@ pub inline fn FLOAT_TO_JSVALUE(arg_val: f32) EncodedJSValue {
 pub inline fn BOOLEAN_TO_JSVALUE(arg_val: @"bool") EncodedJSValue {
     var val = arg_val;
     var res: EncodedJSValue = undefined;
-    res.asInt64 = @bitCast(i64, @as(c_longlong, if (@as(c_int, @boolToInt(val)) != 0) (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 1) else (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 0)));
+    res.asInt64 = @bitCast(i64, @as(c_longlong, if (@as(c_int, @intFromBool(val)) != 0) (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 1) else (@as(c_int, 2) | @as(c_int, 4)) | @as(c_int, 0)));
     return res;
 }
 pub inline fn PTR_TO_JSVALUE(arg_ptr: ?*anyopaque) EncodedJSValue {
     var ptr = arg_ptr;
     var val: EncodedJSValue = undefined;
-    val.asInt64 = @intCast(i64, @ptrToInt(ptr)) + (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49));
+    val.asInt64 = @intCast(i64, @intFromPtr(ptr)) + (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49));
     return val;
 }
 pub inline fn JSVALUE_TO_PTR(arg_val: EncodedJSValue) ?*anyopaque {
     var val = arg_val;
-    return @intToPtr(?*anyopaque, val.asInt64 - (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49)));
+    return @ptrFromInt(?*anyopaque, val.asInt64 - (@as(c_longlong, 1) << @intCast(@import("std").math.Log2Int(c_longlong), 49)));
 }
 pub inline fn JSVALUE_TO_INT32(arg_val: EncodedJSValue) i32 {
     var val = arg_val;
diff --git a/src/bun.js/bindings/ImportMetaObject.cpp b/src/bun.js/bindings/ImportMetaObject.cpp
index a53712823..037305c81 100644
--- a/src/bun.js/bindings/ImportMetaObject.cpp
+++ b/src/bun.js/bindings/ImportMetaObject.cpp
@@ -38,6 +38,9 @@
 #include "JSDOMURL.h"
 #include "JavaScriptCore/JSNativeStdFunction.h"
 #include "JavaScriptCore/GetterSetter.h"
+#include <JavaScriptCore/LazyProperty.h>
+#include <JavaScriptCore/LazyPropertyInlines.h>
+#include <JavaScriptCore/VMTrapsInlines.h>
 
 namespace Zig {
 using namespace JSC;
@@ -56,6 +59,7 @@ static EncodedJSValue functionRequireResolve(JSC::JSGlobalObject* globalObject,
         return JSC::JSValue::encode(JSC::JSValue {});
     }
     default: {
+        JSValue thisValue = callFrame->thisValue();
         JSC::JSValue moduleName = callFrame->argument(0);
 
         auto doIt = [&](const WTF::String& fromStr) -> JSC::EncodedJSValue {
@@ -83,10 +87,12 @@ static EncodedJSValue functionRequireResolve(JSC::JSGlobalObject* globalObject,
             // require.resolve also supports a paths array
             // we only support a single path
             if (!fromValue.isUndefinedOrNull() && fromValue.isObject()) {
-                if (JSValue pathsValue = fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s))) {
-                    if (JSC::JSArray* array = JSC::jsDynamicCast<JSC::JSArray*>(pathsValue)) {
-                        if (array->length() > 0) {
-                            fromValue = array->getIndex(globalObject, 0);
+                if (auto pathsObject = fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s))) {
+                    if (pathsObject.isCell() && pathsObject.asCell()->type() == JSC::JSType::ArrayType) {
+                        auto pathsArray = JSC::jsCast<JSC::JSArray*>(pathsObject);
+                        if (pathsArray->length() > 0) {
+                            fromValue = pathsArray->getIndex(globalObject, 0);
+                            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
                         }
                     }
                 }
@@ -123,216 +129,265 @@ Zig::ImportMetaObject* Zig::ImportMetaObject::create(JSC::JSGlobalObject* global
 }
 
 JSC_DECLARE_HOST_FUNCTION(jsFunctionRequireResolve);
+JSC_DEFINE_HOST_FUNCTION(jsFunctionRequireResolve, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    JSValue thisValue = callFrame->thisValue();
+    WTF::String fromStr;
 
-class JSRequireResolveFunctionPrototype final : public JSC::InternalFunction {
-public:
-    using Base = JSC::InternalFunction;
-
-    static JSRequireResolveFunctionPrototype* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
-    {
-        auto* structure = createStructure(vm, globalObject, globalObject->functionPrototype());
-        JSRequireResolveFunctionPrototype* function = new (NotNull, JSC::allocateCell<JSRequireResolveFunctionPrototype>(vm)) JSRequireResolveFunctionPrototype(vm, structure);
-        function->finishCreation(vm);
-        return function;
+    if (thisValue.isString()) {
+        fromStr = thisValue.toWTFString(globalObject);
     }
 
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
-    {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::InternalFunctionType, StructureFlags), info());
-    }
+    return functionRequireResolve(globalObject, callFrame, fromStr);
+}
 
-    DECLARE_INFO;
+JSC_DEFINE_CUSTOM_GETTER(jsRequireCacheGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    Zig::GlobalObject* thisObject = jsCast<Zig::GlobalObject*>(globalObject);
+    return JSValue::encode(thisObject->lazyRequireCacheObject());
+}
+
+JSC_DEFINE_CUSTOM_SETTER(jsRequireCacheSetter,
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+{
+    JSObject* thisObject = jsDynamicCast<JSObject*>(JSValue::decode(thisValue));
+    if (!thisObject)
+        return false;
+
+    thisObject->putDirect(globalObject->vm(), propertyName, JSValue::decode(value), 0);
+    return true;
+}
 
-    static JSC::EncodedJSValue pathsFunction(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+JSC_DEFINE_HOST_FUNCTION(requireResolvePathsFunction, (JSGlobalObject * globalObject, CallFrame* callframe))
+{
+    return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr, 0));
+}
+
+static const HashTableValue RequireResolveFunctionPrototypeValues[] = {
+    { "paths"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, requireResolvePathsFunction, 1 } },
+};
+
+class RequireResolveFunctionPrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+    static RequireResolveFunctionPrototype* create(
+        JSC::JSGlobalObject* globalObject)
     {
-        return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr));
+        auto& vm = globalObject->vm();
+
+        auto* structure = RequireResolveFunctionPrototype::createStructure(vm, globalObject, globalObject->functionPrototype());
+        RequireResolveFunctionPrototype* prototype = new (NotNull, JSC::allocateCell<RequireResolveFunctionPrototype>(vm)) RequireResolveFunctionPrototype(vm, structure);
+        prototype->finishCreation(vm);
+        return prototype;
     }
 
-private:
-    JSRequireResolveFunctionPrototype(JSC::VM& vm, JSC::Structure* structure)
-        : JSC::InternalFunction(vm, structure, jsFunctionRequireResolve, jsFunctionRequireResolve)
+    DECLARE_INFO;
 
+    RequireResolveFunctionPrototype(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : Base(vm, structure)
     {
     }
 
-    void finishCreation(JSC::VM& vm)
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
-        this->putDirectNativeFunction(vm, globalObject(), Identifier::fromString(vm, "paths"_s), 0, pathsFunction, ImplementationVisibility::Public, NoIntrinsic, 0);
-        Base::finishCreation(vm, 2, "resolve"_s, PropertyAdditionMode::WithoutStructureTransition);
+        return &vm.plainObjectSpace();
     }
 };
 
-const JSC::ClassInfo JSRequireResolveFunctionPrototype::s_info = { "Function"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSRequireResolveFunctionPrototype) };
+class ResolveFunction final : public JSC::InternalFunction {
 
-class JSRequireResolveFunction final : public JSC::InternalFunction {
 public:
     using Base = JSC::InternalFunction;
-
-    static JSRequireResolveFunction* create(JSC::VM& vm, JSC::Structure* structure, const WTF::String& from)
-    {
-        JSRequireResolveFunction* function = new (NotNull, JSC::allocateCell<JSRequireResolveFunction>(vm)) JSRequireResolveFunction(vm, structure, from);
-        function->finishCreation(vm);
-        return function;
-    }
-
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+    static ResolveFunction* create(JSGlobalObject* globalObject)
     {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::InternalFunctionType, StructureFlags), info());
+        JSObject* resolvePrototype = RequireResolveFunctionPrototype::create(globalObject);
+        Structure* structure = Structure::create(
+            globalObject->vm(),
+            globalObject,
+            resolvePrototype,
+            JSC::TypeInfo(JSC::InternalFunctionType, StructureFlags),
+            ResolveFunction::info());
+        auto* resolveFunction = new (NotNull, JSC::allocateCell<ResolveFunction>(globalObject->vm())) ResolveFunction(globalObject->vm(), structure);
+        resolveFunction->finishCreation(globalObject->vm());
+        return resolveFunction;
     }
 
     DECLARE_INFO;
 
-    WTF::String from;
-
-    template<typename, JSC::SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
-    {
-        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
-            return nullptr;
-
-        return WebCore::subspaceForImpl<JSRequireResolveFunction, UseCustomHeapCellType::No>(
-            vm,
-            [](auto& spaces) { return spaces.m_clientSubspaceForRequireResolveFunction.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForRequireResolveFunction = std::forward<decltype(space)>(space); },
-            [](auto& spaces) { return spaces.m_subspaceForRequireResolveFunction.get(); },
-            [](auto& spaces, auto&& space) { spaces.m_subspaceForRequireResolveFunction = std::forward<decltype(space)>(space); });
-    }
-
-private:
-    JSRequireResolveFunction(JSC::VM& vm, JSC::Structure* structure, const WTF::String& from_)
-        : JSC::InternalFunction(vm, structure, jsFunctionRequireResolve, jsFunctionRequireResolve)
-        , from(from_)
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
+        return &vm.internalFunctionSpace();
     }
 
-    void finishCreation(JSC::VM& vm)
+    ResolveFunction(
+        JSC::VM& vm,
+        JSC::Structure* structure)
+        : InternalFunction(vm, structure, jsFunctionRequireResolve, nullptr)
     {
-        Base::finishCreation(vm);
     }
 };
 
-const JSC::ClassInfo JSRequireResolveFunction::s_info = { "Function"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSRequireResolveFunction) };
-
-JSC_DEFINE_HOST_FUNCTION(jsFunctionRequireResolve, (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+JSObject* Zig::ImportMetaObject::createRequireResolveFunctionUnbound(VM& vm, JSGlobalObject* globalObject)
 {
-    JSRequireResolveFunction* thisObject = JSC::jsCast<JSRequireResolveFunction*>(callFrame->jsCallee());
-    return functionRequireResolve(globalObject, callFrame, thisObject->from);
+    return ResolveFunction::create(globalObject);
 }
 
-JSValue Zig::ImportMetaObject::createResolveFunctionPrototype(JSC::VM& vm, Zig::GlobalObject* globalObject)
+JSObject* Zig::ImportMetaObject::createRequireFunctionUnbound(VM& vm, JSGlobalObject* globalObject)
 {
-    return JSRequireResolveFunctionPrototype::create(vm, globalObject);
-}
+    auto& builtinNames = WebCore::builtinNames(vm);
 
-JSC::Structure* Zig::ImportMetaObject::createResolveFunctionStructure(JSC::VM& vm, Zig::GlobalObject* globalObject)
-{
-    JSValue prototype = globalObject->requireResolveFunctionPrototype();
-    return JSRequireResolveFunction::createStructure(vm, globalObject, prototype);
-}
+    JSC::JSFunction* requireDotMainFunction = JSFunction::create(
+        vm,
+        moduleMainCodeGenerator(vm),
+        globalObject->globalScope());
 
-JSC_DEFINE_CUSTOM_GETTER(jsRequireCacheGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
-{
-    Zig::GlobalObject* thisObject = jsCast<Zig::GlobalObject*>(globalObject);
-    return JSValue::encode(thisObject->lazyRequireCacheObject());
+    auto* prototype = JSC::constructEmptyObject(globalObject, globalObject->functionPrototype());
+    prototype->putDirect(
+        vm,
+        JSC::Identifier::fromString(vm, "main"_s),
+        JSC::GetterSetter::create(vm, globalObject, requireDotMainFunction, JSValue()),
+        PropertyAttribute::Builtin | PropertyAttribute::Accessor | PropertyAttribute::ReadOnly | 0);
+    prototype->putDirect(vm, JSC::Identifier::fromString(vm, "extensions"_s), constructEmptyObject(globalObject), 0);
+    prototype->putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "cache"_s), JSC::CustomGetterSetter::create(vm, Zig::jsRequireCacheGetter, Zig::jsRequireCacheSetter), 0);
+    return JSFunction::create(vm, importMetaObjectRequireCodeGenerator(vm), globalObject, JSFunction::createStructure(vm, globalObject, prototype));
 }
 
-JSC_DEFINE_CUSTOM_SETTER(jsRequireCacheSetter,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName propertyName))
+JSObject* Zig::ImportMetaObject::createRequireFunction(VM& vm, JSGlobalObject* lexicalGlobalObject, const WTF::String& pathString)
 {
-    JSObject* thisObject = jsDynamicCast<JSObject*>(JSValue::decode(thisValue));
-    if (!thisObject)
-        return false;
+    auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto& builtinNames = WebCore::builtinNames(vm);
 
-    thisObject->putDirect(globalObject->vm(), propertyName, JSValue::decode(value), 0);
-    return true;
-}
+    JSFunction* resolveFunctionUnbound = jsCast<JSFunction*>(globalObject->importMetaRequireResolveFunctionUnbound());
+    JSFunction* requireFunctionUnbound = jsCast<JSFunction*>(globalObject->importMetaRequireFunctionUnbound());
+    auto str = jsString(vm, pathString);
+
+    JSFunction* requireFunction = JSC::JSBoundFunction::create(vm,
+        globalObject, requireFunctionUnbound,
+        str, ArgList(), 1, jsString(vm, String("require"_s)));
+
+    JSFunction* resolveFunction = JSC::JSBoundFunction::create(vm,
+        globalObject, resolveFunctionUnbound,
+        str, ArgList(), 2, jsString(vm, String("resolve"_s)));
+
+    requireFunction->putDirect(vm, builtinNames.resolvePublicName(), resolveFunction, PropertyAttribute::Function | 0);
 
-JSObject* Zig::ImportMetaObject::createRequireFunction(VM& vm, JSGlobalObject* lexicalGlobalObject, const WTF::String& pathString)
-{
-    Zig::GlobalObject* globalObject = static_cast<Zig::GlobalObject*>(lexicalGlobalObject);
-    JSFunction* requireFunction = JSFunction::create(vm, importMetaObjectRequireCodeGenerator(vm), globalObject);
-    auto* resolveFunction = JSRequireResolveFunction::create(vm, globalObject->requireResolveFunctionStructure(), pathString);
-    auto clientData = WebCore::clientData(vm);
-    requireFunction->putDirect(vm, clientData->builtinNames().pathPublicName(), jsString(vm, pathString), PropertyAttribute::DontEnum | 0);
-    requireFunction->putDirect(vm, clientData->builtinNames().resolvePublicName(), resolveFunction, PropertyAttribute::Function | PropertyAttribute::DontDelete | 0);
-    requireFunction->putDirectCustomAccessor(vm, Identifier::fromString(vm, "cache"_s), JSC::CustomGetterSetter::create(vm, jsRequireCacheGetter, jsRequireCacheSetter), 0);
     return requireFunction;
 }
 
+const JSC::ClassInfo RequireResolveFunctionPrototype::s_info = { "resolve"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(RequireResolveFunctionPrototype) };
+const JSC::ClassInfo ResolveFunction::s_info = { "resolve"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(ResolveFunction) };
+
 extern "C" EncodedJSValue functionImportMeta__resolveSync(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
 {
     JSC::VM& vm = globalObject->vm();
     auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
 
-    switch (callFrame->argumentCount()) {
-    case 0: {
+    JSValue thisValue = callFrame->thisValue();
+    JSC::JSValue moduleName = callFrame->argument(0);
+    JSC::JSValue fromValue = callFrame->argument(1);
 
-        // not "requires" because "require" could be confusing
-        JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync needs 1 argument (a string)"_s);
+    if (moduleName.isUndefinedOrNull()) {
+        JSC::throwTypeError(globalObject, scope, "expects a string"_s);
         scope.release();
         return JSC::JSValue::encode(JSC::JSValue {});
     }
-    default: {
-        JSC::JSValue moduleName = callFrame->argument(0);
 
-        if (moduleName.isUndefinedOrNull()) {
-            JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync expects a string"_s);
-            scope.release();
-            return JSC::JSValue::encode(JSC::JSValue {});
-        }
+    JSC__JSValue from;
+    bool isESM = true;
 
-        JSC__JSValue from;
-        bool isESM = true;
+    if (callFrame->argumentCount() > 1) {
 
-        if (callFrame->argumentCount() > 1) {
-            JSC::JSValue fromValue = callFrame->argument(1);
+        if (callFrame->argumentCount() > 2) {
+            JSC::JSValue isESMValue = callFrame->argument(2);
+            if (isESMValue.isBoolean()) {
+                isESM = isESMValue.toBoolean(globalObject);
+                RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+            }
+        }
 
-            // require.resolve also supports a paths array
-            // we only support a single path
-            if (!fromValue.isUndefinedOrNull() && fromValue.isObject()) {
-                if (JSC::JSArray* array = JSC::jsDynamicCast<JSC::JSArray*>(fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s)))) {
-                    if (array->length() > 0) {
-                        fromValue = array->getIndex(globalObject, 0);
-                    }
-                }
+        if (!fromValue.isUndefinedOrNull() && fromValue.isObject()) {
 
-                if (callFrame->argumentCount() > 2) {
-                    JSC::JSValue isESMValue = callFrame->argument(2);
-                    if (isESMValue.isBoolean()) {
-                        isESM = isESMValue.toBoolean(globalObject);
+            if (auto pathsObject = fromValue.getObject()->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "paths"_s))) {
+                if (pathsObject.isCell() && pathsObject.asCell()->type() == JSC::JSType::ArrayType) {
+                    auto pathsArray = JSC::jsCast<JSC::JSArray*>(pathsObject);
+                    if (pathsArray->length() > 0) {
+                        fromValue = pathsArray->getIndex(globalObject, 0);
                         RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
                     }
                 }
-            } else if (fromValue.isBoolean()) {
-                isESM = fromValue.toBoolean(globalObject);
-                RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
-            }
-            from = JSC::JSValue::encode(fromValue);
-
-        } else {
-            JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(callFrame->thisValue());
-            if (UNLIKELY(!thisObject)) {
-                auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-                JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync must be bound to an import.meta object"_s);
-                return JSC::JSValue::encode(JSC::JSValue {});
             }
 
-            auto clientData = WebCore::clientData(vm);
-
-            from = JSC::JSValue::encode(thisObject->get(globalObject, clientData->builtinNames().pathPublicName()));
+        } else if (fromValue.isBoolean()) {
+            isESM = fromValue.toBoolean(globalObject);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+            fromValue = JSC::jsUndefined();
         }
 
-        auto result = Bun__resolveSync(globalObject, JSC::JSValue::encode(moduleName), from, isESM);
+        if (fromValue.isString()) {
+            from = JSC::JSValue::encode(fromValue);
+        } else if (thisValue.isString()) {
+            from = JSC::JSValue::encode(thisValue);
+        }
 
-        if (!JSC::JSValue::decode(result).isString()) {
-            JSC::throwException(globalObject, scope, JSC::JSValue::decode(result));
+    } else if (thisValue.isString()) {
+        from = JSC::JSValue::encode(thisValue);
+    } else {
+        JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(thisValue);
+        if (UNLIKELY(!thisObject)) {
+            auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+            JSC::throwTypeError(globalObject, scope, "import.meta.resolveSync must be bound to an import.meta object"_s);
             return JSC::JSValue::encode(JSC::JSValue {});
         }
 
+        auto clientData = WebCore::clientData(vm);
+        JSValue pathProperty = thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().pathPublicName());
+
+        if (pathProperty && pathProperty.isString())
+            from = JSC::JSValue::encode(pathProperty);
+    }
+
+    auto result = Bun__resolveSync(globalObject, JSC::JSValue::encode(moduleName), from, isESM);
+
+    if (!JSC::JSValue::decode(result).isString()) {
+        JSC::throwException(globalObject, scope, JSC::JSValue::decode(result));
+        return JSC::JSValue::encode(JSC::JSValue {});
+    }
+
+    scope.release();
+    return result;
+}
+
+extern "C" EncodedJSValue functionImportMeta__resolveSyncPrivate(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    JSC::VM& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+
+    JSC::JSValue moduleName = callFrame->argument(0);
+    JSValue from = callFrame->argument(1);
+    bool isESM = callFrame->argument(2).asBoolean();
+
+    if (moduleName.isUndefinedOrNull()) {
+        JSC::throwTypeError(globalObject, scope, "expected module name as a string"_s);
         scope.release();
-        return result;
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
+
+    RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSC::JSValue {}));
+
+    auto result = Bun__resolveSync(globalObject, JSC::JSValue::encode(moduleName), JSValue::encode(from), isESM);
+
+    if (!JSC::JSValue::decode(result).isString()) {
+        JSC::throwException(globalObject, scope, JSC::JSValue::decode(result));
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
+
+    scope.release();
+    return result;
 }
 
 JSC_DECLARE_HOST_FUNCTION(functionImportMeta__resolve);
@@ -362,7 +417,7 @@ JSC_DEFINE_HOST_FUNCTION(functionImportMeta__resolve,
 
         JSC__JSValue from;
 
-        if (callFrame->argumentCount() > 1) {
+        if (callFrame->argumentCount() > 1 && callFrame->argument(1).isString()) {
             from = JSC::JSValue::encode(callFrame->argument(1));
         } else {
             JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(callFrame->thisValue());
@@ -374,7 +429,7 @@ JSC_DEFINE_HOST_FUNCTION(functionImportMeta__resolve,
 
             auto clientData = WebCore::clientData(vm);
 
-            from = JSC::JSValue::encode(thisObject->get(globalObject, clientData->builtinNames().pathPublicName()));
+            from = JSC::JSValue::encode(thisObject->getIfPropertyExists(globalObject, clientData->builtinNames().pathPublicName()));
         }
 
         return Bun__resolve(globalObject, JSC::JSValue::encode(moduleName), from, true);
@@ -382,89 +437,244 @@ JSC_DEFINE_HOST_FUNCTION(functionImportMeta__resolve,
     }
 }
 
+enum class ImportMetaPropertyOffset : uint32_t {
+    url,
+    dir,
+    file,
+    path,
+    require,
+
+};
+static constexpr uint32_t numberOfImportMetaProperties = 5;
+
+Zig::ImportMetaObject* ImportMetaObject::create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, const WTF::String& url)
+{
+    ImportMetaObject* ptr = new (NotNull, JSC::allocateCell<ImportMetaObject>(vm)) ImportMetaObject(vm, structure, url);
+    ptr->finishCreation(vm);
+    return ptr;
+}
+Zig::ImportMetaObject* ImportMetaObject::create(JSC::JSGlobalObject* jslobalObject, JSC::JSString* keyString)
+{
+    auto* globalObject = jsCast<Zig::GlobalObject*>(jslobalObject);
+    auto& vm = globalObject->vm();
+    auto view = keyString->value(globalObject);
+    JSC::Structure* structure = globalObject->ImportMetaObjectStructure();
+    return Zig::ImportMetaObject::create(vm, globalObject, structure, view);
+}
+
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_url, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+
+    return JSValue::encode(thisObject->urlProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_dir, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+
+    return JSValue::encode(thisObject->dirProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_file, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+
+    return JSValue::encode(thisObject->fileProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_path, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+
+    return JSValue::encode(thisObject->pathProperty.getInitializedOnMainThread(thisObject));
+}
+JSC_DEFINE_CUSTOM_GETTER(jsImportMetaObjectGetter_require, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName propertyName))
+{
+    ImportMetaObject* thisObject = jsDynamicCast<ImportMetaObject*>(JSValue::decode(thisValue));
+    if (UNLIKELY(!thisObject))
+        return JSValue::encode(jsUndefined());
+
+    return JSValue::encode(thisObject->requireProperty.getInitializedOnMainThread(thisObject));
+}
+
+static const HashTableValue ImportMetaObjectPrototypeValues[] = {
+    { "resolve"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, functionImportMeta__resolve, 0 } },
+    { "resolveSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, functionImportMeta__resolveSync, 0 } },
+    { "url"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_url, 0 } },
+    { "dir"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_dir, 0 } },
+    { "file"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_file, 0 } },
+    { "path"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_path, 0 } },
+    { "require"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, jsImportMetaObjectGetter_require, 0 } },
+};
+
 class ImportMetaObjectPrototype final : public JSC::JSNonFinalObject {
 public:
+    DECLARE_INFO;
     using Base = JSC::JSNonFinalObject;
 
-    static ImportMetaObjectPrototype* create(JSC::VM& vm, JSGlobalObject* globalObject, JSC::Structure* structure)
+    static Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
     {
-        ImportMetaObjectPrototype* ptr = new (NotNull, JSC::allocateCell<ImportMetaObjectPrototype>(vm)) ImportMetaObjectPrototype(vm, globalObject, structure);
-        ptr->finishCreation(vm, globalObject);
-        return ptr;
+        return Structure::create(vm, globalObject, globalObject->objectPrototype(), TypeInfo(ObjectType, StructureFlags), info());
+    }
+
+    static ImportMetaObjectPrototype* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+    {
+        ImportMetaObjectPrototype* prototype = new (NotNull, JSC::allocateCell<ImportMetaObjectPrototype>(vm)) ImportMetaObjectPrototype(vm, structure);
+        prototype->finishCreation(vm, globalObject);
+        return prototype;
     }
 
-    DECLARE_INFO;
     template<typename CellType, JSC::SubspaceAccess>
     static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
         return &vm.plainObjectSpace();
     }
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+
+    void finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
     {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
+        Base::finishCreation(vm);
+
+        auto* clientData = WebCore::clientData(vm);
+        auto& builtinNames = clientData->builtinNames();
+
+        reifyStaticProperties(vm, ImportMetaObject::info(), ImportMetaObjectPrototypeValues, *this);
+        JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+
+        this->putDirect(
+            vm,
+            builtinNames.mainPublicName(),
+            GetterSetter::create(vm, globalObject, JSFunction::create(vm, importMetaObjectMainCodeGenerator(vm), globalObject), nullptr),
+            JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Accessor | JSC::PropertyAttribute::Builtin | 0);
     }
 
-private:
-    ImportMetaObjectPrototype(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+    ImportMetaObjectPrototype(JSC::VM& vm, JSC::Structure* structure)
         : Base(vm, structure)
     {
     }
+};
+
+const ClassInfo ImportMetaObjectPrototype::s_info = {
+    "ImportMeta"_s,
 
-    void finishCreation(JSC::VM&, JSC::JSGlobalObject*);
+    Base::info(), nullptr, nullptr, CREATE_METHOD_TABLE(ImportMetaObjectPrototype)
 };
-STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(ImportMetaObjectPrototype, ImportMetaObjectPrototype::Base);
 
-JSObject* ImportMetaObject::createPrototype(VM& vm, JSDOMGlobalObject& globalObject)
+JSC::Structure* ImportMetaObject::createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
 {
-    return ImportMetaObjectPrototype::create(vm, &globalObject, ImportMetaObjectPrototype::createStructure(vm, &globalObject, globalObject.objectPrototype()));
-}
+    ImportMetaObjectPrototype* prototype = ImportMetaObjectPrototype::create(vm, // a prototype object is created on every call
+        globalObject,
+        ImportMetaObjectPrototype::createStructure(vm, globalObject));
 
-void ImportMetaObjectPrototype::finishCreation(VM& vm, JSGlobalObject* globalObject_)
-{
-    Base::finishCreation(vm);
-    auto* globalObject = reinterpret_cast<Zig::GlobalObject*>(globalObject_);
     auto clientData = WebCore::clientData(vm);
-
     auto& builtinNames = clientData->builtinNames();
 
-    this->putDirect(vm, builtinNames.filePublicName(), jsEmptyString(vm), 0);
-    this->putDirect(vm, builtinNames.dirPublicName(), jsEmptyString(vm), 0);
-    this->putDirect(vm, builtinNames.pathPublicName(), jsEmptyString(vm), 0);
-    this->putDirect(vm, builtinNames.urlPublicName(), jsEmptyString(vm), 0);
-
-    this->putDirect(
-        vm,
-        builtinNames.mainPublicName(),
-        GetterSetter::create(vm, globalObject, JSFunction::create(vm, importMetaObjectMainCodeGenerator(vm), globalObject), nullptr),
-        JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Accessor | JSC::PropertyAttribute::Builtin | 0);
-
-    this->putDirect(vm, Identifier::fromString(vm, "primordials"_s), jsUndefined(), JSC::PropertyAttribute::DontEnum | 0);
-
-    String requireString = "[[require]]"_s;
-    this->putDirect(vm, builtinNames.requirePublicName(), Zig::ImportMetaObject::createRequireFunction(vm, globalObject, requireString), PropertyAttribute::Builtin | PropertyAttribute::Function | 0);
-
-    this->putDirectNativeFunction(vm, globalObject, builtinNames.resolvePublicName(), 1,
-        functionImportMeta__resolve,
-        ImplementationVisibility::Public,
-        NoIntrinsic,
-        JSC::PropertyAttribute::Function | 0);
-    this->putDirectNativeFunction(
-        vm, globalObject, builtinNames.resolveSyncPublicName(),
-        1,
-        functionImportMeta__resolveSync,
-        ImplementationVisibility::Public,
-        NoIntrinsic,
-        JSC::PropertyAttribute::Function | 0);
-
-    JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+    return Structure::create(vm, globalObject, prototype, TypeInfo(ObjectType, StructureFlags), ImportMetaObject::info()); // NOTE(review): clientData and builtinNames above are not referenced by this function; candidates for removal
 }
 
 void ImportMetaObject::finishCreation(VM& vm)
 {
     Base::finishCreation(vm);
     ASSERT(inherits(info()));
+    // Register a lazy initializer for each import.meta field. The initializers are
+    // capture-less lambdas, so every one reaches the object through init.owner and
+    // re-derives its value from the `url` member.
+    // requireProperty: the require function created for this module's path.
+    this->requireProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSFunction>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        // fileSystemPath() returns an owning WTF::String temporary, so the path is kept by
+        // value; storing it in a WTF::StringView would leave the view dangling.
+        WTF::String path = url.protocolIs("file"_s)
+            ? url.fileSystemPath()
+            : url.path().toString();
+
+        JSFunction* value = jsCast<JSFunction*>(ImportMetaObject::createRequireFunction(init.vm, meta->globalObject(), path));
+        init.set(value);
+    });
+    // urlProperty: the URL string; a `url` starting with '/' is treated as a file-system path.
+    this->urlProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+
+        init.set(jsString(init.vm, url.string()));
+    });
+    // dirProperty: the path with its trailing '/' or its last segment removed.
+    this->dirProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        // pathStorage owns the characters that the dirname view below points into.
+        WTF::String pathStorage = url.protocolIs("file"_s)
+            ? url.fileSystemPath()
+            : url.path().toString();
+        WTF::StringView dirname = pathStorage;
+
+        if (dirname.endsWith("/"_s)) {
+            dirname = dirname.substring(0, dirname.length() - 1);
+        } else if (dirname.contains('/')) {
+            dirname = dirname.substring(0, dirname.reverseFind('/'));
+        }
+
+        init.set(jsString(init.vm, dirname.toString()));
+    });
+    // fileProperty: the last segment of the path.
+    this->fileProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        // pathStorage owns the characters that the path/filename views below point into.
+        WTF::String pathStorage = url.protocolIs("file"_s)
+            ? url.fileSystemPath()
+            : url.path().toString();
+        WTF::StringView path = pathStorage;
+
+        WTF::StringView filename;
+
+        if (path.endsWith("/"_s)) {
+            filename = path.substring(path.reverseFind('/', path.length() - 2) + 1);
+        } else {
+            filename = path.substring(path.reverseFind('/') + 1);
+        }
+
+        init.set(jsString(init.vm, filename.toString()));
+    });
+    // pathProperty: the file-system path for file: URLs, the URL path otherwise.
+    this->pathProperty.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::JSString>::Initializer& init) {
+        ImportMetaObject* meta = jsCast<ImportMetaObject*>(init.owner);
+
+        WTF::URL url = meta->url.startsWith('/') ? WTF::URL::fileURLWithFileSystemPath(meta->url) : WTF::URL(meta->url);
+        // Held by value: fileSystemPath() yields a temporary WTF::String.
+        WTF::String path = url.protocolIs("file"_s)
+            ? url.fileSystemPath()
+            : url.path().toString();
+
+        init.set(jsString(init.vm, path));
+    });
+}
+
+template<typename Visitor>
+void ImportMetaObject::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{
+    auto* thisObject = jsCast<ImportMetaObject*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    Base::visitChildren(thisObject, visitor);
+    // Report each LazyProperty member of this object to the visitor.
+    thisObject->requireProperty.visit(visitor);
+    thisObject->urlProperty.visit(visitor);
+    thisObject->dirProperty.visit(visitor);
+    thisObject->fileProperty.visit(visitor);
+    thisObject->pathProperty.visit(visitor);
 }
 
+DEFINE_VISIT_CHILDREN(ImportMetaObject);
+
 void ImportMetaObject::analyzeHeap(JSCell* cell, HeapAnalyzer& analyzer)
 {
     auto* thisObject = jsCast<ImportMetaObject*>(cell);
@@ -475,9 +685,6 @@ void ImportMetaObject::analyzeHeap(JSCell* cell, HeapAnalyzer& analyzer)
     Base::analyzeHeap(cell, analyzer);
 }
 
-const JSC::ClassInfo ImportMetaObjectPrototype::s_info = { "ImportMeta"_s, &Base::s_info, nullptr, nullptr,
-    CREATE_METHOD_TABLE(ImportMetaObjectPrototype) };
-
 const JSC::ClassInfo ImportMetaObject::s_info = { "ImportMeta"_s, &Base::s_info, nullptr, nullptr,
     CREATE_METHOD_TABLE(ImportMetaObject) };
 }
diff --git a/src/bun.js/bindings/ImportMetaObject.h b/src/bun.js/bindings/ImportMetaObject.h
index d0f8f0963..6b5661039 100644
--- a/src/bun.js/bindings/ImportMetaObject.h
+++ b/src/bun.js/bindings/ImportMetaObject.h
@@ -9,6 +9,7 @@
 #include "JSDOMWrapperCache.h"
 
 extern "C" JSC_DECLARE_HOST_FUNCTION(functionImportMeta__resolveSync);
+extern "C" JSC_DECLARE_HOST_FUNCTION(functionImportMeta__resolveSyncPrivate);
 extern "C" JSC::EncodedJSValue Bun__resolve(JSC::JSGlobalObject* global, JSC::EncodedJSValue specifier, JSC::EncodedJSValue from, bool is_esm);
 extern "C" JSC::EncodedJSValue Bun__resolveSync(JSC::JSGlobalObject* global, JSC::EncodedJSValue specifier, JSC::EncodedJSValue from, bool is_esm);
 extern "C" JSC::EncodedJSValue Bun__resolveSyncWithSource(JSC::JSGlobalObject* global, JSC::EncodedJSValue specifier, BunString* from, bool is_esm);
@@ -18,97 +19,56 @@ namespace Zig {
 using namespace JSC;
 using namespace WebCore;
 
-class ImportMetaObject final : public JSC::JSDestructibleObject {
+JSC_DECLARE_CUSTOM_GETTER(jsRequireCacheGetter);
+JSC_DECLARE_CUSTOM_SETTER(jsRequireCacheSetter);
+
+class ImportMetaObject final : public JSC::JSNonFinalObject {
 public:
-    using Base = JSC::JSDestructibleObject;
+    using Base = JSC::JSNonFinalObject;
 
-    static ImportMetaObject* create(JSC::VM& vm, JSGlobalObject* globalObject, JSC::Structure* structure)
-    {
-        ImportMetaObject* ptr = new (NotNull, JSC::allocateCell<ImportMetaObject>(vm)) ImportMetaObject(vm, globalObject, structure);
-        ptr->finishCreation(vm);
-        return ptr;
-    }
+    static ImportMetaObject* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, const WTF::String& url); // url is copied into the `url` member by the constructor
 
-    static JSC::Structure* createResolveFunctionStructure(JSC::VM& vm, Zig::GlobalObject* globalObject);
-    static JSValue createResolveFunctionPrototype(JSC::VM& vm, Zig::GlobalObject* globalObject);
+    static JSC::JSObject* createRequireFunctionUnbound(JSC::VM& vm, JSGlobalObject* globalObject);
+    static JSC::JSObject* createRequireResolveFunctionUnbound(JSC::VM& vm, JSGlobalObject* globalObject);
     static JSObject* createRequireFunction(VM& vm, JSGlobalObject* lexicalGlobalObject, const WTF::String& pathString);
 
-    static ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSC::JSValue key);
-
-    static inline Zig::ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSC::JSString* keyString)
-    {
-        // TODO: optimize this by reusing the same JSC::Structure object and using putDirectOffset
-        auto& vm = globalObject->vm();
-        auto view = keyString->value(globalObject);
-        JSC::Structure* structure = WebCore::getDOMStructure<Zig::ImportMetaObject>(vm, *reinterpret_cast<Zig::GlobalObject*>(globalObject));
-        Zig::ImportMetaObject* metaProperties = Zig::ImportMetaObject::create(vm, globalObject, structure);
-        if (UNLIKELY(!metaProperties)) {
-            return nullptr;
-        }
-
-        auto clientData = WebCore::clientData(vm);
-        auto& builtinNames = clientData->builtinNames();
-
-        auto index = view.reverseFind('/', view.length());
-        if (index != WTF::notFound) {
-            metaProperties->putDirect(vm, builtinNames.dirPublicName(),
-                JSC::jsSubstring(globalObject, keyString, 0, index));
-            metaProperties->putDirect(
-                vm, builtinNames.filePublicName(),
-                JSC::jsSubstring(globalObject, keyString, index + 1, view.length() - index - 1));
-        } else {
-            metaProperties->putDirect(vm, builtinNames.filePublicName(), keyString);
-        }
-        metaProperties->putDirect(
-            vm,
-            builtinNames.pathPublicName(),
-            keyString,
-            0);
-
-        metaProperties->putDirect(
-            vm,
-            builtinNames.requirePublicName(),
-            Zig::ImportMetaObject::createRequireFunction(vm, globalObject, view),
-            PropertyAttribute::Builtin | PropertyAttribute::Function | 0);
-
-        if (view.startsWith('/')) {
-            metaProperties->putDirect(vm, builtinNames.urlPublicName(), JSC::JSValue(JSC::jsString(vm, WTF::URL::fileURLWithFileSystemPath(view).string())));
-        } else {
-            if (view.startsWith("node:"_s) || view.startsWith("bun:"_s)) {
-                metaProperties->putDirect(globalObject->vm(), JSC::Identifier::fromString(globalObject->vm(), "primordials"_s), reinterpret_cast<Zig::GlobalObject*>(globalObject)->primordialsObject());
-            }
-            metaProperties->putDirect(vm, builtinNames.urlPublicName(), keyString);
-        }
-
-        return metaProperties;
-    }
+    static ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSC::JSString* keyString);
+    static ImportMetaObject* create(JSC::JSGlobalObject* globalObject, JSValue keyString);
 
     DECLARE_INFO;
+    DECLARE_VISIT_CHILDREN;
 
-    static constexpr bool needsDestruction = true;
-
-    template<typename CellType, SubspaceAccess>
-    static CompleteSubspace* subspaceFor(VM& vm)
+    template<typename, JSC::SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
     {
-        return &vm.destructibleObjectSpace();
-    }
+        if constexpr (mode == JSC::SubspaceAccess::Concurrently) // concurrent access is answered with no subspace
+            return nullptr;
 
-    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
-    {
-        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
+        return WebCore::subspaceForImpl<ImportMetaObject, UseCustomHeapCellType::No>(
+            vm,
+            [](auto& spaces) { return spaces.m_clientSubspaceForImportMeta.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForImportMeta = std::forward<decltype(space)>(space); },
+            [](auto& spaces) { return spaces.m_subspaceForImportMeta.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_subspaceForImportMeta = std::forward<decltype(space)>(space); });
     }
 
-    static JSObject* createPrototype(VM& vm, JSDOMGlobalObject& globalObject);
+    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject);
     static void analyzeHeap(JSCell*, JSC::HeapAnalyzer&);
 
+    WTF::String url; // NOTE(review): needsDestruction was dropped and Base is now JSNonFinalObject; confirm this String is still released when the cell is collected
+    LazyProperty<JSObject, JSFunction> requireProperty; // the five LazyProperty members get their initializers in finishCreation() and are visited in visitChildrenImpl()
+    LazyProperty<JSObject, JSString> dirProperty;
+    LazyProperty<JSObject, JSString> urlProperty;
+    LazyProperty<JSObject, JSString> fileProperty;
+    LazyProperty<JSObject, JSString> pathProperty;
+
 private:
-    ImportMetaObject(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure)
+    ImportMetaObject(JSC::VM& vm, JSC::Structure* structure, const WTF::String& url)
         : Base(vm, structure)
+        , url(url)
     {
     }
 
     void finishCreation(JSC::VM&);
 };
-STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(ImportMetaObject, ImportMetaObject::Base);
 
 }
 \ No newline at end of file
diff --git a/src/bun.js/bindings/JSBuffer.cpp b/src/bun.js/bindings/JSBuffer.cpp
index 00965da89..e420e24ef 100644
--- a/src/bun.js/bindings/JSBuffer.cpp
+++ b/src/bun.js/bindings/JSBuffer.cpp
@@ -1436,43 +1436,71 @@ static inline JSC::EncodedJSValue jsBufferPrototypeFunction_toStringBody(JSC::JS
     if (length == 0)
         return JSC::JSValue::encode(JSC::jsEmptyString(vm));
 
-    switch (callFrame->argumentCount()) {
-    case 0: {
-        break;
-    }
-    case 2:
-    case 3:
-    case 1: {
-        EnsureStillAliveScope arg1 = callFrame->uncheckedArgument(0);
-        if (!arg1.value().isUndefined()) {
-            encoding = parseEncoding(lexicalGlobalObject, scope, arg1.value());
+    size_t argsCount = callFrame->argumentCount();
+
+    JSC::JSValue arg1 = callFrame->argument(0);
+    JSC::JSValue arg2 = callFrame->argument(1);
+    JSC::JSValue arg3 = callFrame->argument(2);
+
+    // This method could be called in following forms:
+    // - toString()
+    // - toString(encoding)
+    // - toString(encoding, start)
+    // - toString(encoding, start, end)
+    // - toString(offset, length)
+    // - toString(offset, length, encoding)
+    if (argsCount == 0)
+        return jsBufferToString(vm, lexicalGlobalObject, castedThis, offset, length, encoding);
+
+    if (arg1.isString()) {
+        encoding = parseEncoding(lexicalGlobalObject, scope, arg1);
+        RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+
+        if (!arg3.isUndefined()) {
+            // length is end
+            length = std::min(byteLength, static_cast<uint32_t>(arg3.toInt32(lexicalGlobalObject)));
             RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
         }
-        if (callFrame->argumentCount() == 1)
-            break;
-    }
-    // any
-    case 5: {
-        JSC::JSValue arg2 = callFrame->uncheckedArgument(1);
-        int32_t ioffset = arg2.toInt32(lexicalGlobalObject);
+
+        int32_t istart = 0;
+
+        if (!arg2.isUndefined()) {
+            istart = arg2.toInt32(lexicalGlobalObject);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+        }
+
+        if (istart < 0) {
+            throwTypeError(lexicalGlobalObject, scope, "Start must be a positive integer"_s);
+            return JSC::JSValue::encode(jsUndefined());
+        }
+        offset = static_cast<uint32_t>(istart);
+        length = (length > offset) ? (length - offset) : 0;
+    } else {
+
+        int32_t ioffset = 0;
+
+        if (!arg1.isUndefined()) {
+            ioffset = arg1.toInt32(lexicalGlobalObject);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+        }
+
         if (ioffset < 0) {
             throwTypeError(lexicalGlobalObject, scope, "Offset must be a positive integer"_s);
             return JSC::JSValue::encode(jsUndefined());
         }
+
         offset = static_cast<uint32_t>(ioffset);
+        length = (length > offset) ? (length - offset) : 0;
 
-        if (callFrame->argumentCount() == 2)
-            break;
-    }
+        if (!arg3.isUndefined()) {
+            encoding = parseEncoding(lexicalGlobalObject, scope, arg3);
+            RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(jsUndefined()));
+        }
 
-    default: {
-        length = std::min(byteLength, static_cast<uint32_t>(callFrame->argument(2).toInt32(lexicalGlobalObject)));
-        break;
-    }
+        if (!arg2.isUndefined())
+            length = std::min(length, static_cast<uint32_t>(arg2.toInt32(lexicalGlobalObject)));
     }
 
-    length -= std::min(offset, length);
-
     return jsBufferToString(vm, lexicalGlobalObject, castedThis, offset, length, encoding);
 }
 
@@ -1662,14 +1690,6 @@ JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_compare, (JSGlobalObject *
     return jsBufferConstructorFunction_compareBody(lexicalGlobalObject, callFrame);
 }
 
-JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_isBuffer, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
-{
-    if (callFrame->argumentCount() < 1)
-        return JSC::JSValue::encode(JSC::jsBoolean(false));
-
-    return JSC::JSValue::encode(JSC::jsBoolean(JSBuffer__isBuffer(lexicalGlobalObject, JSC::JSValue::encode(callFrame->uncheckedArgument(0)))));
-}
-
 JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_concat, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     return jsBufferConstructorFunction_concatBody(lexicalGlobalObject, callFrame);
@@ -1678,24 +1698,6 @@ JSC_DEFINE_HOST_FUNCTION(jsBufferConstructorFunction_concat, (JSGlobalObject * l
 extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorAllocWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int size));
 extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorAllocUnsafeWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int size));
 extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorAllocUnsafeSlowWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int size));
-extern "C" JSC_DECLARE_JIT_OPERATION_WITHOUT_WTF_INTERNAL(jsBufferConstructorIsBufferWithoutTypeChecks, JSValue, (JSC::JSGlobalObject * lexicalGlobalObject, void*, JSUint8Array* value));
-
-static bool isBufferWithCell(JSC::JSGlobalObject* lexicalGlobalObject, JSC::JSUint8Array* cell)
-{
-    auto& vm = lexicalGlobalObject->vm();
-    JSValue prototype = cell->getPrototype(vm, lexicalGlobalObject);
-    return prototype.inherits<JSBufferPrototype>();
-}
-
-JSC_DEFINE_JIT_OPERATION(jsBufferConstructorIsBufferWithoutTypeChecks, JSValue, (JSC::JSGlobalObject * lexicalGlobalObject, void* ctx, JSUint8Array* thisValue))
-{
-    VM& vm = JSC::getVM(lexicalGlobalObject);
-    IGNORE_WARNINGS_BEGIN("frame-address")
-    CallFrame* callFrame = DECLARE_CALL_FRAME(vm);
-    IGNORE_WARNINGS_END
-    JSC::JITOperationPrologueCallFrameTracer tracer(vm, callFrame);
-    return jsBoolean(isBufferWithCell(lexicalGlobalObject, thisValue));
-}
 
 JSC_DEFINE_JIT_OPERATION(jsBufferConstructorAllocWithoutTypeChecks, JSUint8Array*, (JSC::JSGlobalObject * lexicalGlobalObject, void* thisValue, int byteLength))
 {
@@ -1849,11 +1851,17 @@ static const HashTableValue JSBufferPrototypeTableValues[]
           { "readUInt8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt8CodeGenerator, 1 } },
           { "readUIntBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntBECodeGenerator, 1 } },
           { "readUIntLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntLECodeGenerator, 1 } },
+          // name alias
+          { "readUintBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntBECodeGenerator, 1 } },
+          { "readUintLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUIntLECodeGenerator, 1 } },
+          { "readUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt8CodeGenerator, 1 } },
           { "readUint16BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt16BECodeGenerator, 1 } },
           { "readUint16LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt16LECodeGenerator, 1 } },
           { "readUint32BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt32BECodeGenerator, 1 } },
           { "readUint32LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt32LECodeGenerator, 1 } },
-          { "readUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadUInt8CodeGenerator, 1 } },
+          { "readBigUint64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadBigUInt64BECodeGenerator, 1 } },
+          { "readBigUint64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeReadBigUInt64LECodeGenerator, 1 } },
+
           { "slice"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeSliceCodeGenerator, 2 } },
           { "subarray"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeSliceCodeGenerator, 2 } },
           { "swap16"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferPrototypeFunction_swap16, 0 } },
@@ -1873,8 +1881,6 @@ static const HashTableValue JSBufferPrototypeTableValues[]
           { "writeBigInt64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigInt64LECodeGenerator, 1 } },
           { "writeBigUInt64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64BECodeGenerator, 1 } },
           { "writeBigUInt64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64LECodeGenerator, 1 } },
-          { "writeBigUint64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64BECodeGenerator, 1 } },
-          { "writeBigUint64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64LECodeGenerator, 1 } },
           { "writeDouble"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteDoubleLECodeGenerator, 1 } },
           { "writeDoubleBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteDoubleBECodeGenerator, 1 } },
           { "writeDoubleLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteDoubleLECodeGenerator, 1 } },
@@ -1897,13 +1903,18 @@ static const HashTableValue JSBufferPrototypeTableValues[]
           { "writeUInt8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt8CodeGenerator, 1 } },
           { "writeUIntBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntBECodeGenerator, 1 } },
           { "writeUIntLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntLECodeGenerator, 1 } },
+          // name alias
+          { "writeUintBE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntBECodeGenerator, 1 } },
+          { "writeUintLE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUIntLECodeGenerator, 1 } },
+          { "writeUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt8CodeGenerator, 1 } },
           { "writeUint16"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt16LECodeGenerator, 1 } },
           { "writeUint16BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt16BECodeGenerator, 1 } },
           { "writeUint16LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt16LECodeGenerator, 1 } },
           { "writeUint32"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt32LECodeGenerator, 1 } },
           { "writeUint32BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt32BECodeGenerator, 1 } },
           { "writeUint32LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt32LECodeGenerator, 1 } },
-          { "writeUint8"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteUInt8CodeGenerator, 1 } },
+          { "writeBigUint64BE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64BECodeGenerator, 1 } },
+          { "writeBigUint64LE"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferPrototypeWriteBigUInt64LECodeGenerator, 1 } },
       };
 
 void JSBufferPrototype::finishCreation(VM& vm, JSC::JSGlobalObject* globalThis)
@@ -1928,11 +1939,6 @@ static const JSC::DOMJIT::Signature DOMJITSignaturejsBufferConstructorAlloc(jsBu
     JSC::DOMJIT::Effect::forWriteKinds(JSC::DFG::AbstractHeapKind::Heap),
     JSC::SpecUint8Array, JSC::SpecInt32Only);
 
-static const JSC::DOMJIT::Signature DOMJITSignaturejsBufferConstructorIsBuffer(jsBufferConstructorIsBufferWithoutTypeChecks,
-    JSBufferConstructor::info(),
-    JSC::DOMJIT::Effect::forPure(),
-    JSC::SpecOther, JSC::SpecUint8Array);
-
 static const JSC::DOMJIT::Signature DOMJITSignaturejsBufferConstructorAllocUnsafe(jsBufferConstructorAllocUnsafeWithoutTypeChecks,
     JSBufferConstructor::info(),
     JSC::DOMJIT::Effect::forWriteKinds(JSC::DFG::AbstractHeapKind::Heap),
@@ -1954,7 +1960,7 @@ static const HashTableValue JSBufferConstructorTableValues[] = {
     { "compare"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_compare, 2 } },
     { "concat"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_concat, 2 } },
     { "from"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferConstructorFromCodeGenerator, 1 } },
-    { "isBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DOMJITFunction), NoIntrinsic, { HashTableValue::DOMJITFunctionType, jsBufferConstructorFunction_isBuffer, &DOMJITSignaturejsBufferConstructorIsBuffer } },
+    { "isBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Builtin), NoIntrinsic, { HashTableValue::BuiltinGeneratorType, jsBufferConstructorIsBufferCodeGenerator, 1 } },
     { "toBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_toBuffer, 1 } },
     { "isEncoding"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsBufferConstructorFunction_isEncoding, 1 } },
 };
diff --git a/src/bun.js/bindings/JSBundlerPlugin.cpp b/src/bun.js/bindings/JSBundlerPlugin.cpp
index cae6a4b22..ec3933574 100644
--- a/src/bun.js/bindings/JSBundlerPlugin.cpp
+++ b/src/bun.js/bindings/JSBundlerPlugin.cpp
@@ -54,7 +54,7 @@ void BundlerPlugin::NamespaceList::append(JSC::VM& vm, JSC::RegExp* filter, Stri
     nsGroup->append(WTFMove(regex));
 }
 
-bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespaceStr, const ZigString* path, bool isOnLoad)
+bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const BunString* namespaceStr, const BunString* path, bool isOnLoad)
 {
     constexpr bool usesPatternContextBuffer = false;
     if (isOnLoad) {
@@ -62,7 +62,7 @@ bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespac
             return false;
 
         // Avoid unnecessary string copies
-        auto namespaceString = namespaceStr ? Zig::toString(*namespaceStr) : String();
+        auto namespaceString = namespaceStr ? Bun::toWTFString(*namespaceStr) : String();
 
         auto* group = this->onLoad.group(namespaceString);
         if (group == nullptr) {
@@ -70,7 +70,7 @@ bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespac
         }
 
         auto& filters = *group;
-        auto pathString = Zig::toString(*path);
+        auto pathString = Bun::toWTFString(*path);
 
         for (auto& filter : filters) {
             Yarr::MatchingContextHolder regExpContext(vm, usesPatternContextBuffer, nullptr, Yarr::MatchFrom::CompilerThread);
@@ -84,14 +84,14 @@ bool BundlerPlugin::anyMatchesCrossThread(JSC::VM& vm, const ZigString* namespac
             return false;
 
         // Avoid unnecessary string copies
-        auto namespaceString = namespaceStr ? Zig::toString(*namespaceStr) : String();
+        auto namespaceString = namespaceStr ? Bun::toWTFString(*namespaceStr) : String();
 
         auto* group = this->onResolve.group(namespaceString);
         if (group == nullptr) {
             return false;
         }
 
-        auto pathString = Zig::toString(*path);
+        auto pathString = Bun::toWTFString(*path);
         auto& filters = *group;
 
         for (auto& filter : filters) {
@@ -115,9 +115,19 @@ static const HashTableValue JSBundlerPluginHashTable[] = {
 class JSBundlerPlugin final : public JSC::JSNonFinalObject {
 public:
     using Base = JSC::JSNonFinalObject;
-    static JSBundlerPlugin* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, void* config, BunPluginTarget target)
+    static JSBundlerPlugin* create(JSC::VM& vm,
+        JSC::JSGlobalObject* globalObject,
+        JSC::Structure* structure,
+        void* config,
+        BunPluginTarget target,
+        JSBundlerPluginAddErrorCallback addError = JSBundlerPlugin__addError, // the three callbacks default to the JSBundlerPlugin__* functions
+        JSBundlerPluginOnLoadAsyncCallback onLoadAsync = JSBundlerPlugin__onLoadAsync,
+        JSBundlerPluginOnResolveAsyncCallback onResolveAsync = JSBundlerPlugin__onResolveAsync)
     {
-        JSBundlerPlugin* ptr = new (NotNull, JSC::allocateCell<JSBundlerPlugin>(vm)) JSBundlerPlugin(vm, globalObject, structure, config, target);
+        JSBundlerPlugin* ptr = new (NotNull, JSC::allocateCell<JSBundlerPlugin>(vm)) JSBundlerPlugin(vm, globalObject, structure, config, target, // callbacks are forwarded unchanged to the constructor
+            addError,
+            onLoadAsync,
+            onResolveAsync);
         ptr->finishCreation(vm);
         return ptr;
     }
@@ -147,9 +157,10 @@ public:
     JSC::LazyProperty<JSBundlerPlugin, JSC::JSFunction> setupFunction;
 
 private:
-    JSBundlerPlugin(JSC::VM& vm, JSC::JSGlobalObject*, JSC::Structure* structure, void* config, BunPluginTarget target)
+    JSBundlerPlugin(JSC::VM& vm, JSC::JSGlobalObject*, JSC::Structure* structure, void* config, BunPluginTarget target,
+        JSBundlerPluginAddErrorCallback addError, JSBundlerPluginOnLoadAsyncCallback onLoadAsync, JSBundlerPluginOnResolveAsyncCallback onResolveAsync)
         : JSC::JSNonFinalObject(vm, structure)
-        , plugin(BundlerPlugin(config, target))
+        , plugin(BundlerPlugin(config, target, addError, onLoadAsync, onResolveAsync))
     {
     }
 
@@ -199,7 +210,7 @@ JSC_DEFINE_HOST_FUNCTION(jsBundlerPluginFunction_addError, (JSC::JSGlobalObject
 {
     JSBundlerPlugin* thisObject = jsCast<JSBundlerPlugin*>(callFrame->thisValue());
     if (!thisObject->plugin.tombstoned) {
-        JSBundlerPlugin__addError(
+        thisObject->plugin.addError(
             UNWRAP_BUNDLER_PLUGIN(callFrame),
             thisObject->plugin.config,
             JSValue::encode(callFrame->argument(1)),
@@ -212,7 +223,7 @@ JSC_DEFINE_HOST_FUNCTION(jsBundlerPluginFunction_onLoadAsync, (JSC::JSGlobalObje
 {
     JSBundlerPlugin* thisObject = jsCast<JSBundlerPlugin*>(callFrame->thisValue());
     if (!thisObject->plugin.tombstoned) {
-        JSBundlerPlugin__onLoadAsync(
+        thisObject->plugin.onLoadAsync(
             UNWRAP_BUNDLER_PLUGIN(callFrame),
             thisObject->plugin.config,
             JSValue::encode(callFrame->argument(1)),
@@ -225,7 +236,7 @@ JSC_DEFINE_HOST_FUNCTION(jsBundlerPluginFunction_onResolveAsync, (JSC::JSGlobalO
 {
     JSBundlerPlugin* thisObject = jsCast<JSBundlerPlugin*>(callFrame->thisValue());
     if (!thisObject->plugin.tombstoned) {
-        JSBundlerPlugin__onResolveAsync(
+        thisObject->plugin.onResolveAsync(
             UNWRAP_BUNDLER_PLUGIN(callFrame),
             thisObject->plugin.config,
             JSValue::encode(callFrame->argument(1)),
@@ -274,15 +285,15 @@ void JSBundlerPlugin::finishCreation(JSC::VM& vm)
     reifyStaticProperties(vm, JSBundlerPlugin::info(), JSBundlerPluginHashTable, *this);
 }
 
-extern "C" bool JSBundlerPlugin__anyMatches(Bun::JSBundlerPlugin* pluginObject, const ZigString* namespaceString, const ZigString* path, bool isOnLoad)
+extern "C" bool JSBundlerPlugin__anyMatches(Bun::JSBundlerPlugin* pluginObject, const BunString* namespaceString, const BunString* path, bool isOnLoad)
 {
     return pluginObject->plugin.anyMatchesCrossThread(pluginObject->vm(), namespaceString, path, isOnLoad);
 }
 
-extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const ZigString* namespaceString, const ZigString* path, void* context, uint8_t defaultLoaderId)
+extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const BunString* namespaceString, const BunString* path, void* context, uint8_t defaultLoaderId)
 {
-    WTF::String namespaceStringStr = namespaceString ? Zig::toStringCopy(*namespaceString) : WTF::String();
-    WTF::String pathStr = path ? Zig::toStringCopy(*path) : WTF::String();
+    WTF::String namespaceStringStr = namespaceString ? Bun::toWTFString(*namespaceString) : WTF::String();
+    WTF::String pathStr = path ? Bun::toWTFString(*path) : WTF::String();
 
     JSFunction* function = plugin->onLoadFunction.get(plugin);
     if (UNLIKELY(!function))
@@ -306,7 +317,7 @@ extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject,
         auto exception = scope.exception();
         scope.clearException();
         if (!plugin->plugin.tombstoned) {
-            JSBundlerPlugin__addError(
+            plugin->plugin.addError(
                 context,
                 plugin->plugin.config,
                 JSC::JSValue::encode(exception),
@@ -315,14 +326,14 @@ extern "C" void JSBundlerPlugin__matchOnLoad(JSC::JSGlobalObject* globalObject,
     }
 }
 
-extern "C" void JSBundlerPlugin__matchOnResolve(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const ZigString* namespaceString, const ZigString* path, const ZigString* importer, void* context, uint8_t kindId)
+extern "C" void JSBundlerPlugin__matchOnResolve(JSC::JSGlobalObject* globalObject, Bun::JSBundlerPlugin* plugin, const BunString* namespaceString, const BunString* path, const BunString* importer, void* context, uint8_t kindId)
 {
-    WTF::String namespaceStringStr = namespaceString ? Zig::toStringCopy(*namespaceString) : WTF::String("file"_s);
+    WTF::String namespaceStringStr = namespaceString ? Bun::toWTFString(*namespaceString) : WTF::String("file"_s);
     if (namespaceStringStr.length() == 0) {
         namespaceStringStr = WTF::String("file"_s);
     }
-    WTF::String pathStr = path ? Zig::toStringCopy(*path) : WTF::String();
-    WTF::String importerStr = importer ? Zig::toStringCopy(*importer) : WTF::String();
+    WTF::String pathStr = path ? Bun::toWTFString(*path) : WTF::String();
+    WTF::String importerStr = importer ? Bun::toWTFString(*importer) : WTF::String();
     auto& vm = globalObject->vm();
 
     JSFunction* function = plugin->onResolveFunction.get(plugin);
diff --git a/src/bun.js/bindings/JSBundlerPlugin.h b/src/bun.js/bindings/JSBundlerPlugin.h
index 08aa1d140..4d82cdc1b 100644
--- a/src/bun.js/bindings/JSBundlerPlugin.h
+++ b/src/bun.js/bindings/JSBundlerPlugin.h
@@ -9,6 +9,10 @@
 #include <JavaScriptCore/Yarr.h>
 #include <JavaScriptCore/Strong.h>
 
+typedef void (*JSBundlerPluginAddErrorCallback)(void*, void*, JSC::EncodedJSValue, JSC::EncodedJSValue);
+typedef void (*JSBundlerPluginOnLoadAsyncCallback)(void*, void*, JSC::EncodedJSValue, JSC::EncodedJSValue);
+typedef void (*JSBundlerPluginOnResolveAsyncCallback)(void*, void*, JSC::EncodedJSValue, JSC::EncodedJSValue, JSC::EncodedJSValue);
+
 namespace Bun {
 
 using namespace JSC;
@@ -42,10 +46,13 @@ public:
     };
 
 public:
-    bool anyMatchesCrossThread(JSC::VM&, const ZigString* namespaceStr, const ZigString* path, bool isOnLoad);
+    bool anyMatchesCrossThread(JSC::VM&, const BunString* namespaceStr, const BunString* path, bool isOnLoad);
     void tombstone() { tombstoned = true; }
 
-    BundlerPlugin(void* config, BunPluginTarget target)
+    BundlerPlugin(void* config, BunPluginTarget target, JSBundlerPluginAddErrorCallback addError, JSBundlerPluginOnLoadAsyncCallback onLoadAsync, JSBundlerPluginOnResolveAsyncCallback onResolveAsync)
+        : addError(addError)
+        , onLoadAsync(onLoadAsync)
+        , onResolveAsync(onResolveAsync)
     {
         this->target = target;
         this->config = config;
@@ -54,6 +61,10 @@ public:
     NamespaceList onLoad = {};
     NamespaceList onResolve = {};
     BunPluginTarget target { BunPluginTargetBrowser };
+
+    JSBundlerPluginAddErrorCallback addError;
+    JSBundlerPluginOnLoadAsyncCallback onLoadAsync;
+    JSBundlerPluginOnResolveAsyncCallback onResolveAsync;
     void* config { nullptr };
     bool tombstoned { false };
 };
diff --git a/src/bun.js/bindings/JSEnvironmentVariableMap.cpp b/src/bun.js/bindings/JSEnvironmentVariableMap.cpp
index 5c0357066..4989f7e96 100644
--- a/src/bun.js/bindings/JSEnvironmentVariableMap.cpp
+++ b/src/bun.js/bindings/JSEnvironmentVariableMap.cpp
@@ -30,7 +30,7 @@ JSC_DEFINE_CUSTOM_GETTER(jsGetterEnvironmentVariable, (JSGlobalObject * globalOb
     if (UNLIKELY(name.len == 0))
         return JSValue::encode(jsUndefined());
 
-    if (!Bun__getEnvValue(globalObject, &name, &value) || value.len == 0) {
+    if (!Bun__getEnvValue(globalObject, &name, &value)) {
         return JSValue::encode(jsUndefined());
     }
 
@@ -144,4 +144,4 @@ JSValue createEnvironmentVariablesMap(Zig::GlobalObject* globalObject)
 
     return object;
 }
-}
-\ No newline at end of file
+}
diff --git a/src/bun.js/bindings/JSMockFunction.cpp b/src/bun.js/bindings/JSMockFunction.cpp
index b7c2659b4..3a84f0139 100644
--- a/src/bun.js/bindings/JSMockFunction.cpp
+++ b/src/bun.js/bindings/JSMockFunction.cpp
@@ -19,6 +19,7 @@
 #include <JavaScriptCore/WeakMapImpl.h>
 #include <JavaScriptCore/WeakMapImplInlines.h>
 #include <JavaScriptCore/FunctionPrototype.h>
+#include <JavaScriptCore/DateInstance.h>
 
 namespace Bun {
 
@@ -65,6 +66,41 @@ JSC_DECLARE_HOST_FUNCTION(jsMockFunctionMockRejectedValueOnce);
 JSC_DECLARE_HOST_FUNCTION(jsMockFunctionWithImplementationCleanup);
 JSC_DECLARE_HOST_FUNCTION(jsMockFunctionWithImplementation);
 
+// This is a stub: it installs nothing and only returns `this`. Exists so that the same code can be run in Jest
+extern "C" EncodedJSValue JSMock__jsUseFakeTimers(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    return JSValue::encode(callFrame->thisValue());
+}
+
+extern "C" EncodedJSValue JSMock__jsUseRealTimers(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    globalObject->overridenDateNow = -1; // NOTE(review): -1 looks like the "no override" sentinel — confirm against jsDateNow()
+    return JSValue::encode(callFrame->thisValue()); // returns the `this` value of the call
+}
+
+extern "C" EncodedJSValue JSMock__jsNow(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    return JSValue::encode(jsNumber(globalObject->jsDateNow())); // wraps the result of jsDateNow() as a JS number
+}
+extern "C" EncodedJSValue JSMock__jsSetSystemTime(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callFrame)
+{
+    JSValue argument0 = callFrame->argument(0);
+    // Stores argument 0 (a Date or a positive number) in globalObject->overridenDateNow and returns `this`.
+    if (auto* dateInstance = jsDynamicCast<DateInstance*>(argument0)) {
+        if (std::isnormal(dateInstance->internalNumber())) { // NaN, +/-Infinity and 0 are not "normal": value left untouched
+            globalObject->overridenDateNow = dateInstance->internalNumber();
+        }
+        return JSValue::encode(callFrame->thisValue());
+    }
+    // Numeric argument: return right away so the fallback below does not overwrite the value just stored.
+    if (argument0.isNumber() && argument0.asNumber() > 0) {
+        globalObject->overridenDateNow = argument0.asNumber();
+        return JSValue::encode(callFrame->thisValue());
+    }
+    globalObject->overridenDateNow = -1; // any other argument: same value JSMock__jsUseRealTimers stores
+    return JSValue::encode(callFrame->thisValue());
+}
+
 uint64_t JSMockModule::s_nextInvocationId = 0;
 
 // This is taken from JSWeakSet
@@ -391,6 +427,7 @@ void JSMockFunction::visitChildrenImpl(JSCell* cell, Visitor& visitor)
     visitor.append(fn->instances);
     visitor.append(fn->returnValues);
     visitor.append(fn->invocationCallOrder);
+    visitor.append(fn->spyOriginal);
     fn->mock.visit(visitor);
 }
 DEFINE_VISIT_CHILDREN(JSMockFunction);
@@ -526,13 +563,13 @@ extern "C" void JSMock__resetSpies(Zig::GlobalObject* globalObject)
     globalObject->mockModule.activeSpies.clear();
 }
 
-extern "C" EncodedJSValue jsFunctionResetSpies(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callframe)
+extern "C" EncodedJSValue JSMock__jsRestoreAllMocks(JSC::JSGlobalObject* globalObject, JSC::CallFrame* callframe)
 {
     JSMock__resetSpies(jsCast<Zig::GlobalObject*>(globalObject));
     return JSValue::encode(jsUndefined());
 }
 
-extern "C" EncodedJSValue JSMock__spyOn(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callframe)
+extern "C" EncodedJSValue JSMock__jsSpyOn(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callframe)
 {
     auto& vm = lexicalGlobalObject->vm();
     auto scope = DECLARE_THROW_SCOPE(vm);
@@ -568,15 +605,19 @@ extern "C" EncodedJSValue JSMock__spyOn(JSC::JSGlobalObject* lexicalGlobalObject
 
     // easymode: regular property or missing property
     if (!hasValue || slot.isValue()) {
+        JSValue value = jsUndefined();
+        if (hasValue) {
+            value = slot.getValue(globalObject, propertyKey);
+            if (jsDynamicCast<JSMockFunction*>(value)) {
+                return JSValue::encode(value);
+            }
+        }
+
         auto* mock = JSMockFunction::create(vm, globalObject, globalObject->mockModule.mockFunctionStructure.getInitializedOnMainThread(globalObject), CallbackKind::GetterSetter);
         mock->spyTarget = JSC::Weak<JSObject>(object, &weakValueHandleOwner(), nullptr);
         mock->spyIdentifier = propertyKey.isSymbol() ? Identifier::fromUid(vm, propertyKey.uid()) : Identifier::fromString(vm, propertyKey.publicName());
         mock->spyAttributes = hasValue ? slot.attributes() : 0;
         unsigned attributes = 0;
-        JSValue value = jsUndefined();
-
-        if (hasValue)
-            value = slot.getValue(globalObject, propertyKey);
 
         if (hasValue && ((slot.attributes() & PropertyAttribute::Function) != 0 || (value.isCell() && value.isCallable()))) {
             if (hasValue)
@@ -963,7 +1004,7 @@ JSC_DEFINE_CUSTOM_GETTER(jsMockFunctionGetter_protoImpl, (JSC::JSGlobalObject *
     return JSValue::encode(jsUndefined());
 }
 
-JSC_DEFINE_HOST_FUNCTION(jsMockFunctionConstructor, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callframe))
+extern "C" EncodedJSValue JSMock__jsMockFn(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callframe)
 {
     auto& vm = lexicalGlobalObject->vm();
     auto* globalObject = jsCast<Zig::GlobalObject*>(lexicalGlobalObject);
@@ -997,11 +1038,6 @@ JSC_DEFINE_HOST_FUNCTION(jsMockFunctionConstructor, (JSC::JSGlobalObject * lexic
     return JSValue::encode(thisObject);
 }
 
-extern "C" EncodedJSValue JSMockFunction__createObject(Zig::GlobalObject* globalObject)
-{
-    auto& vm = globalObject->vm();
-    return JSValue::encode(JSC::JSFunction::create(vm, globalObject, 0, "mock"_s, jsMockFunctionConstructor, ImplementationVisibility::Public));
-}
 extern "C" EncodedJSValue JSMockFunction__getCalls(EncodedJSValue encodedValue)
 {
     JSValue value = JSValue::decode(encodedValue);
diff --git a/src/bun.js/bindings/JSReadableHelper.h b/src/bun.js/bindings/JSReadableHelper.h
index 6746bcbec..3e2554c2b 100644
--- a/src/bun.js/bindings/JSReadableHelper.h
+++ b/src/bun.js/bindings/JSReadableHelper.h
@@ -8,7 +8,6 @@ JSC_DECLARE_HOST_FUNCTION(jsReadable_maybeReadMore);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_resume);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_emitReadable);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_onEofChunk);
-JSC_DECLARE_HOST_FUNCTION(jsReadable_resume_);
 JSC_DECLARE_HOST_FUNCTION(jsReadable_emitReadable_);
 
 } // namespace WebCore
diff --git a/src/bun.js/bindings/JSReadableState.cpp b/src/bun.js/bindings/JSReadableState.cpp
index d09e30d44..1f3a36def 100644
--- a/src/bun.js/bindings/JSReadableState.cpp
+++ b/src/bun.js/bindings/JSReadableState.cpp
@@ -26,10 +26,10 @@ int64_t getHighWaterMark(JSC::VM& vm, JSC::JSGlobalObject* globalObject, bool is
     auto* clientData = WebCore::clientData(vm);
     if (JSValue highWaterMarkVal = options->getIfPropertyExists(globalObject, clientData->builtinNames().highWaterMarkPublicName())) {
         if (isDuplex && (highWaterMarkVal.isUndefined() || highWaterMarkVal.isNull())) {
-            highWaterMarkVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
+            highWaterMarkVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
         }
 
-        if (!highWaterMarkVal.isUndefinedOrNull()) {
+        if (highWaterMarkVal && highWaterMarkVal.isNumber()) {
             return highWaterMarkVal.toInt32(globalObject);
         }
     }
@@ -42,9 +42,9 @@ void JSReadableState::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj
     Base::finishCreation(vm);
 
     if (options != nullptr) {
-        JSC::JSValue objectModeVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "objectMode"_s));
+        JSC::JSValue objectModeVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "objectMode"_s));
         if (isDuplex && !objectModeVal) {
-            objectModeVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
+            objectModeVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "readableObjectMode"_s));
         }
         if (objectModeVal && objectModeVal.toBoolean(globalObject))
             setBool(JSReadableState::Mask::objectMode, true);
@@ -65,13 +65,16 @@ void JSReadableState::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj
     m_pipes.set(vm, this, JSC::constructEmptyArray(globalObject, nullptr, 0));
 
     if (options != nullptr) {
-        JSC::JSValue emitCloseVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "emitClose"_s));
-        if (!emitCloseVal.isBoolean() || emitCloseVal.toBoolean(globalObject))
+        JSC::JSValue emitCloseVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "emitClose"_s));
+        if (!emitCloseVal || emitCloseVal.toBoolean(globalObject))
             setBool(JSReadableState::Mask::emitClose, true);
         // Has it been destroyed.
-        JSC::JSValue autoDestroyVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "autoDestroy"_s));
-        if (!autoDestroyVal.isBoolean() || autoDestroyVal.toBoolean(globalObject))
+        JSC::JSValue autoDestroyVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "autoDestroy"_s));
+        if (!autoDestroyVal || autoDestroyVal.toBoolean(globalObject))
             setBool(JSReadableState::Mask::autoDestroy, true);
+    } else {
+        setBool(JSReadableState::Mask::emitClose, true);
+        setBool(JSReadableState::Mask::autoDestroy, true);
     }
 
     // Indicates whether the stream has finished destroying.
@@ -90,26 +93,25 @@ void JSReadableState::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObj
     }
 
     m_awaitDrainWriters.set(vm, this, JSC::jsNull());
+    JSValue decodeValue = JSC::jsNull();
+    JSValue encodingValue = JSC::jsNull();
 
-    if (options == nullptr) {
-        m_decoder.set(vm, this, JSC::jsNull());
-        m_encoding.set(vm, this, JSC::jsNull());
-    } else {
-        JSC::JSValue encodingVal = options->getDirect(vm, JSC::Identifier::fromString(vm, "encoding"_s));
+    if (options != nullptr) {
+        JSC::JSValue encodingVal = options->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "encoding"_s));
         if (encodingVal && encodingVal.isString()) {
             auto constructor = reinterpret_cast<Zig::GlobalObject*>(globalObject)->JSStringDecoder();
             auto constructData = JSC::getConstructData(constructor);
             MarkedArgumentBuffer args;
             args.append(encodingVal);
             JSObject* decoder = JSC::construct(globalObject, constructor, constructData, args);
-            m_decoder.set(vm, this, decoder);
-            m_encoding.set(vm, this, encodingVal);
-        } else {
-            m_decoder.set(vm, this, JSC::jsNull());
-            m_encoding.set(vm, this, JSC::jsNull());
+            decodeValue = decoder;
+            encodingValue = encodingVal;
         }
     }
 
+    m_decoder.set(vm, this, decodeValue);
+    m_encoding.set(vm, this, encodingValue);
+
     // ReadableState.constructed is set to false during construction when a _construct method is implemented
     // this is here so that the ReadableState behavior tracks the behavior in node, and that calling Readable.read
     // will work when we return early from construct because there is no Readable._construct implemented
@@ -403,10 +405,12 @@ JSC::EncodedJSValue JSReadableStateConstructor::construct(JSC::JSGlobalObject* l
         return JSValue::encode(jsUndefined());
     }
     isDuplex = isDuplexVal.toBoolean(lexicalGlobalObject);
+    RETURN_IF_EXCEPTION(throwScope, encodedJSValue());
     JSObject* options = nullptr;
-    if (optionsVal.toBoolean(lexicalGlobalObject) && optionsVal.isObject()) {
+    if (optionsVal && optionsVal.isObject()) {
         options = optionsVal.toObject(lexicalGlobalObject);
     }
+    RETURN_IF_EXCEPTION(throwScope, encodedJSValue());
 
     JSReadableState* stringDecoder = JSReadableState::create(
         vm, lexicalGlobalObject, reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject)->JSReadableStateStructure(), isDuplex, options);
diff --git a/src/bun.js/bindings/JSSink.cpp b/src/bun.js/bindings/JSSink.cpp
index 36be334dd..ed2554dc7 100644
--- a/src/bun.js/bindings/JSSink.cpp
+++ b/src/bun.js/bindings/JSSink.cpp
@@ -1,6 +1,6 @@
 
 // AUTO-GENERATED FILE. DO NOT EDIT.
-// Generated by 'make generate-sink' at 2023-05-18T01:04:00.447Z
+// Generated by 'make generate-sink' at 2023-07-06T14:22:07.346Z
 // To regenerate this file, run:
 //
 //   make generate-sink
diff --git a/src/bun.js/bindings/JSSink.h b/src/bun.js/bindings/JSSink.h
index 5bbfab777..386554ebb 100644
--- a/src/bun.js/bindings/JSSink.h
+++ b/src/bun.js/bindings/JSSink.h
@@ -1,6 +1,6 @@
 
 // AUTO-GENERATED FILE. DO NOT EDIT.
-// Generated by 'make generate-sink' at 2023-05-18T01:04:00.446Z
+// Generated by 'make generate-sink' at 2023-07-06T14:22:07.345Z
 //
 #pragma once
 
diff --git a/src/bun.js/bindings/JSSinkLookupTable.h b/src/bun.js/bindings/JSSinkLookupTable.h
index a4ace6dc3..e4ed81629 100644
--- a/src/bun.js/bindings/JSSinkLookupTable.h
+++ b/src/bun.js/bindings/JSSinkLookupTable.h
@@ -1,4 +1,4 @@
-// Automatically generated from src/bun.js/bindings/JSSink.cpp using /Users/jarred/Code/bun/src/bun.js/WebKit/Source/JavaScriptCore/create_hash_table. DO NOT EDIT!
+// Automatically generated from src/bun.js/bindings/JSSink.cpp using /home/cirospaciari/Repos/bun/src/bun.js/WebKit/Source/JavaScriptCore/create_hash_table. DO NOT EDIT!
 
 
 
diff --git a/src/bun.js/bindings/JSStringDecoder.cpp b/src/bun.js/bindings/JSStringDecoder.cpp
index 5ec258522..b8c2dd50c 100644
--- a/src/bun.js/bindings/JSStringDecoder.cpp
+++ b/src/bun.js/bindings/JSStringDecoder.cpp
@@ -129,7 +129,7 @@ uint8_t JSStringDecoder::utf8CheckIncomplete(uint8_t* bufPtr, uint32_t length, u
             m_lastNeed = nb - 1;
         return nb;
     }
-    if (--j < i || nb == -2)
+    if (j == 0 || --j < i || nb == -2)
         return 0;
     nb = utf8CheckByte(bufPtr[j]);
     if (nb >= 0) {
@@ -137,7 +137,7 @@ uint8_t JSStringDecoder::utf8CheckIncomplete(uint8_t* bufPtr, uint32_t length, u
             m_lastNeed = nb - 2;
         return nb;
     }
-    if (--j < i || nb == -2)
+    if (j == 0 || --j < i || nb == -2)
         return 0;
     nb = utf8CheckByte(bufPtr[j]);
     if (nb >= 0) {
diff --git a/src/bun.js/bindings/ModuleLoader.cpp b/src/bun.js/bindings/ModuleLoader.cpp
index ed1e5702b..0ccbb7dbb 100644
--- a/src/bun.js/bindings/ModuleLoader.cpp
+++ b/src/bun.js/bindings/ModuleLoader.cpp
@@ -36,6 +36,11 @@
 #include "../modules/TTYModule.h"
 #include "node_util_types.h"
 #include "CommonJSModuleRecord.h"
+#include <JavaScriptCore/JSModuleLoader.h>
+#include <JavaScriptCore/Completion.h>
+#include <JavaScriptCore/JSModuleNamespaceObject.h>
+#include <JavaScriptCore/JSMap.h>
+#include <JavaScriptCore/JSMapInlines.h>
 
 namespace Bun {
 using namespace Zig;
@@ -350,6 +355,110 @@ extern "C" void Bun__onFulfillAsyncModule(
     promise->resolve(promise->globalObject(), JSC::JSSourceCode::create(vm, JSC::SourceCode(provider)));
 }
 
+JSValue fetchCommonJSModule(
+    Zig::GlobalObject* globalObject,
+    JSCommonJSModule* target,
+    JSValue specifierValue,
+    BunString* specifier,
+    BunString* referrer)
+{
+    void* bunVM = globalObject->bunVM();
+    auto& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    ErrorableResolvedSource resValue;
+    ErrorableResolvedSource* res = &resValue;
+    // Returns `target` once it has been evaluated, or an empty JSValue when an exception is pending.
+    auto& builtinNames = WebCore::clientData(vm)->builtinNames();
+    // jsNumber(-1) is returned when nothing was evaluated into `target` (see the three sites below).
+    if (Bun__fetchBuiltinModule(bunVM, globalObject, specifier, referrer, res)) {
+        if (!res->success) {
+            throwException(scope, res->result.err, globalObject);
+            return JSValue();
+        }
+        // Each recognized synthetic module tag is evaluated into `target` with its matching generator.
+        switch (res->result.value.tag) {
+        case SyntheticModuleType::Module: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateNodeModuleModule);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+
+        case SyntheticModuleType::Buffer: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateBufferSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::TTY: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateTTYSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::NodeUtilTypes: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), Bun::generateNodeUtilTypesSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::Process: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateProcessSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::Events: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateEventsSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        case SyntheticModuleType::StringDecoder: {
+            target->evaluate(globalObject, Bun::toWTFString(*specifier), generateStringDecoderSourceCode);
+            RETURN_IF_EXCEPTION(scope, {});
+            RELEASE_AND_RETURN(scope, target);
+        }
+        default: {
+            RELEASE_AND_RETURN(scope, jsNumber(-1)); // tag has no case above: nothing evaluated
+        }
+        }
+    }
+
+    // if (JSC::JSValue virtualModuleResult = JSValue::decode(Bun__runVirtualModule(globalObject, specifier))) {
+    //     return handleVirtualModuleResult<allowPromise>(globalObject, virtualModuleResult, res, specifier, referrer);
+    // }
+    auto* loader = globalObject->moduleLoader();
+    JSMap* registry = jsCast<JSMap*>(loader->getDirect(vm, Identifier::fromString(vm, "registry"_s)));
+    // True when the registry entry for `specifierValue` carries a state beyond JSModuleLoader::Status::Fetch.
+    auto hasAlreadyLoadedESMVersionSoWeShouldntTranspileItTwice = [&]() -> bool {
+        JSValue entry = registry->get(globalObject, specifierValue);
+
+        if (!entry || !entry.isObject()) {
+            return false;
+        }
+        // A missing or non-int32 state counts as "not loaded" rather than being read with an unchecked asInt32().
+        JSValue state = entry.getObject()->getDirect(vm, builtinNames.statePublicName());
+        return state && state.isInt32() && state.asInt32() > JSModuleLoader::Status::Fetch;
+    };
+
+    if (hasAlreadyLoadedESMVersionSoWeShouldntTranspileItTwice()) {
+        RELEASE_AND_RETURN(scope, jsNumber(-1));
+    }
+
+    Bun__transpileFile(bunVM, globalObject, specifier, referrer, res, false);
+
+    if (res->success && res->result.value.commonJSExportsLen) {
+        target->evaluate(globalObject, Bun::toWTFString(*specifier).isolatedCopy(), res->result.value);
+        RETURN_IF_EXCEPTION(scope, {});
+        RELEASE_AND_RETURN(scope, target);
+    }
+
+    if (!res->success) {
+        throwException(scope, res->result.err, globalObject);
+        RELEASE_AND_RETURN(scope, {});
+    }
+    // Success with commonJSExportsLen == 0: pass the source to the module loader under `specifierValue`.
+    auto&& provider = Zig::SourceProvider::create(globalObject, res->result.value);
+    globalObject->moduleLoader()->provideFetch(globalObject, specifierValue, JSC::SourceCode(provider));
+    RETURN_IF_EXCEPTION(scope, {});
+    RELEASE_AND_RETURN(scope, jsNumber(-1));
+}
+
 template<bool allowPromise>
 static JSValue fetchSourceCode(
     Zig::GlobalObject* globalObject,
@@ -382,6 +491,11 @@ static JSValue fetchSourceCode(
 
     auto rejectOrResolve = [&](JSValue code) -> JSValue {
         if (auto* exception = scope.exception()) {
+            if constexpr (!allowPromise) {
+                scope.release();
+                return {};
+            }
+
             scope.clearException();
             return rejectedInternalPromise(globalObject, exception);
         }
@@ -457,7 +571,7 @@ static JSValue fetchSourceCode(
             return rejectOrResolve(JSSourceCode::create(vm, WTFMove(source)));
         }
         default: {
-            auto&& provider = Zig::SourceProvider::create(globalObject, res->result.value);
+            auto&& provider = Zig::SourceProvider::create(globalObject, res->result.value, JSC::SourceProviderSourceType::Module, true);
             return rejectOrResolve(JSC::JSSourceCode::create(vm, JSC::SourceCode(provider)));
         }
         }
@@ -477,8 +591,19 @@ static JSValue fetchSourceCode(
     }
 
     if (res->success && res->result.value.commonJSExportsLen) {
-        auto source = Bun::createCommonJSModule(globalObject, res->result.value);
-        return rejectOrResolve(JSSourceCode::create(vm, WTFMove(source)));
+        auto created = Bun::createCommonJSModule(globalObject, res->result.value);
+
+        if (created.has_value()) {
+            return rejectOrResolve(JSSourceCode::create(vm, WTFMove(created.value())));
+        }
+
+        if constexpr (allowPromise) {
+            auto* exception = scope.exception();
+            scope.clearException();
+            return rejectedInternalPromise(globalObject, exception);
+        } else {
+            return JSC::jsUndefined();
+        }
     }
 
     if (!res->success) {
diff --git a/src/bun.js/bindings/ModuleLoader.h b/src/bun.js/bindings/ModuleLoader.h
index 0deaeff08..6eb04bf40 100644
--- a/src/bun.js/bindings/ModuleLoader.h
+++ b/src/bun.js/bindings/ModuleLoader.h
@@ -15,6 +15,8 @@ class JSInternalPromise;
 namespace Bun {
 using namespace JSC;
 
+class JSCommonJSModule;
+
 typedef uint8_t OnLoadResultType;
 const OnLoadResultType OnLoadResultTypeError = 0;
 const OnLoadResultType OnLoadResultTypeCode = 1;
@@ -91,4 +93,11 @@ JSValue fetchSourceCodeAsync(
     BunString* specifier,
     BunString* referrer);
 
+JSValue fetchCommonJSModule(
+    Zig::GlobalObject* globalObject,
+    JSCommonJSModule* moduleObject,
+    JSValue specifierValue,
+    BunString* specifier,
+    BunString* referrer);
+
 } // namespace Bun
 \ No newline at end of file
diff --git a/src/bun.js/bindings/Process.cpp b/src/bun.js/bindings/Process.cpp
index 69ee11e60..745be0e47 100644
--- a/src/bun.js/bindings/Process.cpp
+++ b/src/bun.js/bindings/Process.cpp
@@ -10,13 +10,41 @@
 #include "ImportMetaObject.h"
 #include <sys/stat.h>
 #include "ZigConsoleClient.h"
+#include <JavaScriptCore/GetterSetter.h>
+#include <JavaScriptCore/JSSet.h>
+#include <JavaScriptCore/LazyProperty.h>
+#include <JavaScriptCore/LazyPropertyInlines.h>
+#include <JavaScriptCore/VMTrapsInlines.h>
+
 #pragma mark - Node.js Process
 
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#endif
+
+#if defined(__linux__)
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#endif
+
+#if !defined(_MSC_VER)
+#include <unistd.h> // setuid, getuid
+#endif
+
 namespace Zig {
 
 using namespace JSC;
 
 #define REPORTED_NODE_VERSION "18.15.0"
+#define processObjectBindingCodeGenerator processObjectInternalsBindingCodeGenerator
+#define processObjectMainModuleCodeGenerator moduleMainCodeGenerator
+
+#if !defined(BUN_WEBKIT_VERSION)
+#define BUN_WEBKIT_VERSION "unknown"
+#endif
 
 using JSGlobalObject = JSC::JSGlobalObject;
 using Exception = JSC::Exception;
@@ -30,117 +58,42 @@ using JSObject = JSC::JSObject;
 using JSNonFinalObject = JSC::JSNonFinalObject;
 namespace JSCastingHelpers = JSC::JSCastingHelpers;
 
-static JSC_DECLARE_CUSTOM_SETTER(Process_setTitle);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getArgv);
-static JSC_DECLARE_CUSTOM_SETTER(Process_setArgv);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getTitle);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getVersionsLazy);
-static JSC_DECLARE_CUSTOM_SETTER(Process_setVersionsLazy);
-
-static JSC_DECLARE_CUSTOM_GETTER(Process_getPID);
-static JSC_DECLARE_CUSTOM_GETTER(Process_getPPID);
-
-static JSC_DECLARE_HOST_FUNCTION(Process_functionCwd);
+JSC_DECLARE_CUSTOM_SETTER(Process_setTitle);
+JSC_DECLARE_CUSTOM_GETTER(Process_getArgv);
+JSC_DECLARE_CUSTOM_SETTER(Process_setArgv);
+JSC_DECLARE_CUSTOM_GETTER(Process_getTitle);
+JSC_DECLARE_CUSTOM_GETTER(Process_getPID);
+JSC_DECLARE_CUSTOM_GETTER(Process_getPPID);
+JSC_DECLARE_HOST_FUNCTION(Process_functionCwd);
+static bool processIsExiting = false;
+
+extern "C" uint8_t Bun__getExitCode(void*);
+extern "C" uint8_t Bun__setExitCode(void*, uint8_t);
+extern "C" void* Bun__getVM();
+extern "C" Zig::GlobalObject* Bun__getDefaultGlobal();
+extern "C" const char* Bun__githubURL;
 
-static JSValue constructStdioWriteStream(JSC::JSGlobalObject* globalObject, int fd)
+static void dispatchExitInternal(JSC::JSGlobalObject* globalObject, Process* process, int exitCode) // Emits "exit" (with exitCode) on the process emitter; guarded so it dispatches at most once.
 {
-    auto& vm = globalObject->vm();
-    auto scope = DECLARE_THROW_SCOPE(vm);
-    auto* thisObject = reinterpret_cast<Zig::GlobalObject*>(globalObject);
-    JSC::JSFunction* getStdioWriteStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdioWriteStreamCodeGenerator(vm), globalObject);
-    JSC::MarkedArgumentBuffer args;
-    WTF::String process = WTF::String("node:process"_s);
-    JSC::JSValue requireFunction = Zig::ImportMetaObject::createRequireFunction(
-        vm,
-        globalObject,
-        process);
-
-    args.append(JSC::jsNumber(fd));
-    args.append(requireFunction);
-
-    auto clientData = WebCore::clientData(vm);
-    JSC::CallData callData = JSC::getCallData(getStdioWriteStream);
-
-    NakedPtr<JSC::Exception> returnedException = nullptr;
-    auto result = JSC::call(globalObject, getStdioWriteStream, callData, globalObject->globalThis(), args, returnedException);
-    RETURN_IF_EXCEPTION(scope, {});
-
-    if (returnedException) {
-        throwException(globalObject, scope, returnedException.get());
-        return {};
-    }
-
-    return result;
-}
 
-JSC_DEFINE_CUSTOM_GETTER(
-    Process_lazyStdinGetter,
-    (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName property))
-{
+    if (processIsExiting) // Re-entrancy guard: a second call (e.g. from an "exit" listener) is ignored.
+        return;
+    processIsExiting = true; // NOTE(review): plain file-static bool, set before the checks below — confirm only one thread reaches this.
+    auto& emitter = process->wrapped();
     auto& vm = globalObject->vm();
-    auto scope = DECLARE_THROW_SCOPE(vm);
-    JSC::JSValue value = JSC::JSValue::decode(thisValue);
-    if (!value || value.isUndefinedOrNull() || !value.isObject())
-        return JSValue::encode(jsUndefined());
-
-    auto* thisObject = reinterpret_cast<Zig::GlobalObject*>(globalObject);
-    JSC::JSFunction* getStdioWriteStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdinStreamCodeGenerator(vm), globalObject);
-    JSC::MarkedArgumentBuffer args;
-    WTF::String process = WTF::String("node:process"_s);
-    JSC::JSValue requireFunction = Zig::ImportMetaObject::createRequireFunction(
-        vm,
-        globalObject,
-        process);
-
-    args.append(JSC::jsNumber(STDIN_FILENO));
-    args.append(requireFunction);
-    args.append(thisObject->get(globalObject, PropertyName(JSC::Identifier::fromString(vm, "Bun"_s))));
 
-    auto clientData = WebCore::clientData(vm);
-    JSC::CallData callData = JSC::getCallData(getStdioWriteStream);
-
-    NakedPtr<JSC::Exception> returnedException = nullptr;
-    auto result = JSC::call(globalObject, getStdioWriteStream, callData, globalObject->globalThis(), args, returnedException);
-    RETURN_IF_EXCEPTION(scope, {});
+    if (vm.hasTerminationRequest() || vm.hasExceptionsAfterHandlingTraps()) // Do not emit while the VM is terminating or reports exceptions.
+        return;
 
-    if (UNLIKELY(returnedException)) {
-        throwException(globalObject, scope, returnedException.get());
-        return {};
+    auto event = Identifier::fromString(vm, "exit"_s);
+    if (!emitter.hasEventListeners(event)) { // No "exit" listeners: nothing to emit, and `_exiting` is left untouched.
+        return;
     }
+    process->putDirect(vm, Identifier::fromString(vm, "_exiting"_s), jsBoolean(true), 0); // Only reached when at least one "exit" listener exists.
 
-    if (LIKELY(result))
-        value.getObject()->putDirect(vm, property, result, 0);
-
-    return JSValue::encode(result);
-}
-
-JSC_DEFINE_CUSTOM_GETTER(
-    Process_lazyStdoutGetter,
-    (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName property))
-{
-    JSValue value = JSValue::decode(thisValue);
-    auto& vm = globalObject->vm();
-    JSC::JSObject* thisObject = value.toObject(globalObject);
-    JSC::JSValue stream = constructStdioWriteStream(globalObject, 1);
-
-    if (stream)
-        thisObject->putDirect(vm, property, stream, 0);
-
-    return JSValue::encode(stream);
-}
-
-JSC_DEFINE_CUSTOM_GETTER(
-    Process_lazyStderrGetter, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName property))
-{
-    JSValue value = JSValue::decode(thisValue);
-    auto& vm = globalObject->vm();
-    JSC::JSObject* thisObject = value.toObject(globalObject);
-    JSC::JSValue stream = constructStdioWriteStream(globalObject, 2);
-
-    if (stream)
-        thisObject->putDirect(vm, property, stream, 0);
-
-    return JSValue::encode(stream);
+    MarkedArgumentBuffer arguments;
+    arguments.append(jsNumber(exitCode)); // Listeners receive the exit code as their single argument.
+    emitter.emit(event, arguments);
 }
 
 JSC_DEFINE_CUSTOM_SETTER(Process_defaultSetter,
@@ -323,6 +276,29 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionUmask,
 extern "C" uint64_t Bun__readOriginTimer(void*);
 extern "C" double Bun__readOriginTimerStart(void*);
 
+// https://github.com/nodejs/node/blob/1936160c31afc9780e4365de033789f39b7cbc0c/src/api/hooks.cc#L49
+extern "C" void Process__dispatchOnBeforeExit(Zig::GlobalObject* globalObject, uint8_t exitCode)
+{
+    // Nothing to dispatch when the global reports no process object.
+    if (!globalObject->hasProcessObject())
+        return;
+
+    auto& emitter = jsCast<Process*>(globalObject->processObject())->wrapped();
+    MarkedArgumentBuffer eventArguments;
+    eventArguments.append(jsNumber(exitCode));
+    emitter.emit(Identifier::fromString(globalObject->vm(), "beforeExit"_s), eventArguments);
+}
+
+extern "C" void Process__dispatchOnExit(Zig::GlobalObject* globalObject, uint8_t exitCode)
+{
+    // Forward to dispatchExitInternal only when the global reports a process object;
+    // otherwise this call is a no-op.
+    if (globalObject->hasProcessObject()) {
+        auto* processObject = jsCast<Process*>(globalObject->processObject());
+        dispatchExitInternal(globalObject, processObject, exitCode);
+    }
+}
+
 JSC_DEFINE_HOST_FUNCTION(Process_functionUptime,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
@@ -335,14 +311,39 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionUptime,
 JSC_DEFINE_HOST_FUNCTION(Process_functionExit,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
-    if (callFrame->argumentCount() == 0) {
-        // TODO: exitCode
-        Bun__Process__exit(globalObject, 0);
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm());
+    uint8_t exitCode = 0;
+    JSValue arg0 = callFrame->argument(0); // process.exit(code): `code` is optional.
+    if (arg0.isNumber()) {
+        if (!arg0.isInt32()) { // Reject numbers that are not int32 values (fractions, NaN, out-of-range).
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+
+        int exitCode32 = arg0.toInt32(globalObject);
+        RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
+
+        if (exitCode32 < 0 || exitCode32 > 127) { // Validate before narrowing to uint8_t below.
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer between 0 and 127"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+
+        exitCode = static_cast<uint8_t>(exitCode32);
+    } else if (!arg0.isUndefinedOrNull()) { // Any non-number other than undefined/null is a type error.
+        throwTypeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+        return JSC::JSValue::encode(JSC::JSValue {});
     } else {
-        Bun__Process__exit(globalObject, callFrame->argument(0).toInt32(globalObject));
+        exitCode = Bun__getExitCode(Bun__getVM()); // No explicit code: use whatever Bun__getExitCode reports.
+    }
+
+    auto* zigGlobal = jsDynamicCast<Zig::GlobalObject*>(globalObject);
+    if (UNLIKELY(!zigGlobal)) { // Fall back to the default global when this one is not a Zig::GlobalObject.
+        zigGlobal = Bun__getDefaultGlobal();
     }
 
-    return JSC::JSValue::encode(JSC::jsUndefined());
+    Process__dispatchOnExit(zigGlobal, exitCode); // Run "exit" listeners before terminating.
+    Bun__Process__exit(zigGlobal, exitCode);
+    __builtin_unreachable(); // NOTE(review): relies on Bun__Process__exit never returning — confirm it is noreturn.
 }
 
 extern "C" uint64_t Bun__readOriginTimer(void*);
@@ -350,9 +351,12 @@ extern "C" uint64_t Bun__readOriginTimer(void*);
 JSC_DEFINE_HOST_FUNCTION(Process_functionHRTime,
     (JSC::JSGlobalObject * globalObject_, JSC::CallFrame* callFrame))
 {
+
     Zig::GlobalObject* globalObject
         = reinterpret_cast<Zig::GlobalObject*>(globalObject_);
     auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
     uint64_t time = Bun__readOriginTimer(globalObject->bunVM());
     int64_t seconds = static_cast<int64_t>(time / 1000000000);
     int64_t nanoseconds = time % 1000000000;
@@ -361,7 +365,6 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionHRTime,
         JSC::JSValue arg0 = callFrame->uncheckedArgument(0);
         if (!arg0.isUndefinedOrNull()) {
             JSArray* relativeArray = JSC::jsDynamicCast<JSC::JSArray*>(arg0);
-            auto throwScope = DECLARE_THROW_SCOPE(vm);
             if ((!relativeArray && !arg0.isUndefinedOrNull()) || relativeArray->length() < 2) {
                 JSC::throwTypeError(globalObject, throwScope, "hrtime() argument must be an array or undefined"_s);
                 return JSC::JSValue::encode(JSC::JSValue {});
@@ -381,27 +384,38 @@ JSC_DEFINE_HOST_FUNCTION(Process_functionHRTime,
                 seconds--;
                 nanoseconds += 1000000000;
             }
-            throwScope.release();
         }
     }
 
-    auto* array = JSArray::create(vm, globalObject->originalArrayStructureForIndexingType(ArrayWithContiguous), 2);
-    array->setIndexQuickly(vm, 0, JSC::jsNumber(seconds));
-    array->setIndexQuickly(vm, 1, JSC::jsNumber(nanoseconds));
-    return JSC::JSValue::encode(JSC::JSValue(array));
+    JSC::JSArray* array = nullptr;
+    {
+        JSC::ObjectInitializationScope initializationScope(vm);
+        if ((array = JSC::JSArray::tryCreateUninitializedRestricted(
+                 initializationScope, nullptr,
+                 globalObject->arrayStructureForIndexingTypeDuringAllocation(JSC::ArrayWithContiguous),
+                 2))) {
+
+            array->initializeIndex(initializationScope, 0, JSC::jsNumber(seconds));
+            array->initializeIndex(initializationScope, 1, JSC::jsNumber(nanoseconds));
+        }
+    }
+
+    if (UNLIKELY(!array)) {
+        JSC::throwOutOfMemoryError(globalObject, throwScope);
+        return JSC::JSValue::encode(JSC::JSValue {});
+    }
+
+    RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(array));
 }
-static JSC_DECLARE_HOST_FUNCTION(Process_functionHRTimeBigInt);
 
-static JSC_DEFINE_HOST_FUNCTION(Process_functionHRTimeBigInt,
+JSC_DEFINE_HOST_FUNCTION(Process_functionHRTimeBigInt,
     (JSC::JSGlobalObject * globalObject_, JSC::CallFrame* callFrame))
 {
     Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(globalObject_);
     return JSC::JSValue::encode(JSValue(JSC::JSBigInt::createFrom(globalObject, Bun__readOriginTimer(globalObject->bunVM()))));
 }
 
-static JSC_DECLARE_HOST_FUNCTION(Process_functionChdir);
-
-static JSC_DEFINE_HOST_FUNCTION(Process_functionChdir,
+JSC_DEFINE_HOST_FUNCTION(Process_functionChdir,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
@@ -423,119 +437,220 @@ static JSC_DEFINE_HOST_FUNCTION(Process_functionChdir,
     return JSC::JSValue::encode(result);
 }
 
-extern "C" const char* Bun__githubURL;
-
-JSC_DEFINE_CUSTOM_GETTER(Process_getterRelease, (JSGlobalObject * globalObject, EncodedJSValue thisValue, PropertyName))
+static HashMap<String, int>* signalNameToNumberMap = nullptr;
+static HashMap<int, String>* signalNumberToNameMap = nullptr;
+
+// signal number to the set of script execution context ids that care about the signal
+static HashMap<int, HashSet<uint32_t>>* signalToContextIdsMap = nullptr;
+static Lock signalToContextIdsMapLock;
+
+static const NeverDestroyed<String> signalNames[] = { // Indexed positionally (signalNames[0]..signalNames[30]) by the map builders below; keep the order stable.
+    MAKE_STATIC_STRING_IMPL("SIGHUP"),
+    MAKE_STATIC_STRING_IMPL("SIGINT"),
+    MAKE_STATIC_STRING_IMPL("SIGQUIT"),
+    MAKE_STATIC_STRING_IMPL("SIGILL"),
+    MAKE_STATIC_STRING_IMPL("SIGTRAP"),
+    MAKE_STATIC_STRING_IMPL("SIGABRT"),
+    MAKE_STATIC_STRING_IMPL("SIGIOT"),
+    MAKE_STATIC_STRING_IMPL("SIGBUS"),
+    MAKE_STATIC_STRING_IMPL("SIGFPE"),
+    MAKE_STATIC_STRING_IMPL("SIGKILL"), // index 9: present in the table, but its map registration is commented out
+    MAKE_STATIC_STRING_IMPL("SIGUSR1"),
+    MAKE_STATIC_STRING_IMPL("SIGSEGV"),
+    MAKE_STATIC_STRING_IMPL("SIGUSR2"),
+    MAKE_STATIC_STRING_IMPL("SIGPIPE"),
+    MAKE_STATIC_STRING_IMPL("SIGALRM"),
+    MAKE_STATIC_STRING_IMPL("SIGTERM"),
+    MAKE_STATIC_STRING_IMPL("SIGCHLD"),
+    MAKE_STATIC_STRING_IMPL("SIGCONT"),
+    MAKE_STATIC_STRING_IMPL("SIGSTOP"), // index 18: present in the table, but its map registration is commented out
+    MAKE_STATIC_STRING_IMPL("SIGTSTP"),
+    MAKE_STATIC_STRING_IMPL("SIGTTIN"),
+    MAKE_STATIC_STRING_IMPL("SIGTTOU"),
+    MAKE_STATIC_STRING_IMPL("SIGURG"),
+    MAKE_STATIC_STRING_IMPL("SIGXCPU"),
+    MAKE_STATIC_STRING_IMPL("SIGXFSZ"),
+    MAKE_STATIC_STRING_IMPL("SIGVTALRM"),
+    MAKE_STATIC_STRING_IMPL("SIGPROF"),
+    MAKE_STATIC_STRING_IMPL("SIGWINCH"),
+    MAKE_STATIC_STRING_IMPL("SIGIO"),
+    MAKE_STATIC_STRING_IMPL("SIGINFO"), // index 29: registered under `#ifdef SIGINFO` guards because the macro may be undefined
+    MAKE_STATIC_STRING_IMPL("SIGSYS"),
+};
+
+static void loadSignalNumberMap() // Populates signalNameToNumberMap exactly once (std::call_once); later calls do nothing.
 {
-    auto& vm = globalObject->vm();
 
-    auto* release = JSC::constructEmptyObject(globalObject);
-    release->putDirect(vm, Identifier::fromString(vm, "name"_s), jsString(vm, WTF::String("bun"_s)), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "lts"_s), jsBoolean(false), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "sourceUrl"_s), jsString(vm, WTF::String(Bun__githubURL, strlen(Bun__githubURL))), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "headersUrl"_s), jsEmptyString(vm), 0);
-    release->putDirect(vm, Identifier::fromString(vm, "libUrl"_s), jsEmptyString(vm), 0);
+    static std::once_flag signalNameToNumberMapOnceFlag;
+    std::call_once(signalNameToNumberMapOnceFlag, [] {
+        signalNameToNumberMap = new HashMap<String, int>(); // Never freed: lives for the rest of the process.
+        signalNameToNumberMap->reserveInitialCapacity(31); // One slot per entry in signalNames.
+        signalNameToNumberMap->add(signalNames[0], SIGHUP);
+        signalNameToNumberMap->add(signalNames[1], SIGINT);
+        signalNameToNumberMap->add(signalNames[2], SIGQUIT);
+        signalNameToNumberMap->add(signalNames[3], SIGILL);
+        signalNameToNumberMap->add(signalNames[4], SIGTRAP);
+        signalNameToNumberMap->add(signalNames[5], SIGABRT);
+        signalNameToNumberMap->add(signalNames[6], SIGIOT);
+        signalNameToNumberMap->add(signalNames[7], SIGBUS);
+        signalNameToNumberMap->add(signalNames[8], SIGFPE);
+        // signalNameToNumberMap->add(signalNames[9], SIGKILL);
+        signalNameToNumberMap->add(signalNames[10], SIGUSR1);
+        signalNameToNumberMap->add(signalNames[11], SIGSEGV);
+        signalNameToNumberMap->add(signalNames[12], SIGUSR2);
+        signalNameToNumberMap->add(signalNames[13], SIGPIPE);
+        signalNameToNumberMap->add(signalNames[14], SIGALRM);
+        signalNameToNumberMap->add(signalNames[15], SIGTERM);
+        signalNameToNumberMap->add(signalNames[16], SIGCHLD);
+        signalNameToNumberMap->add(signalNames[17], SIGCONT);
+        // signalNameToNumberMap->add(signalNames[18], SIGSTOP);
+        signalNameToNumberMap->add(signalNames[19], SIGTSTP);
+        signalNameToNumberMap->add(signalNames[20], SIGTTIN);
+        signalNameToNumberMap->add(signalNames[21], SIGTTOU);
+        signalNameToNumberMap->add(signalNames[22], SIGURG);
+        signalNameToNumberMap->add(signalNames[23], SIGXCPU);
+        signalNameToNumberMap->add(signalNames[24], SIGXFSZ);
+        signalNameToNumberMap->add(signalNames[25], SIGVTALRM);
+        signalNameToNumberMap->add(signalNames[26], SIGPROF);
+        signalNameToNumberMap->add(signalNames[27], SIGWINCH);
+        signalNameToNumberMap->add(signalNames[28], SIGIO);
+#ifdef SIGINFO
+        signalNameToNumberMap->add(signalNames[29], SIGINFO);
+#endif
 
-    return JSValue::encode(release);
+#ifndef SIGINFO
+        signalNameToNumberMap->add(signalNames[29], 255); // Placeholder when SIGINFO is undefined. NOTE(review): confirm 255 is never a deliverable signal here.
+#endif
+        signalNameToNumberMap->add(signalNames[30], SIGSYS);
+    });
 }
 
-JSC_DEFINE_CUSTOM_SETTER(Process_setterRelease,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName))
+static void onDidChangeListeners(EventEmitter& eventEmitter, const Identifier& eventName, bool isAdded) // Installs or restores OS signal handling as listeners for signal-named events are added/removed.
 {
-    JSC::VM& vm = globalObject->vm();
+    loadSignalNumberMap();
+
+    static std::once_flag signalNumberToNameMapOnceFlag;
+    std::call_once(signalNumberToNameMapOnceFlag, [] {
+        signalNumberToNameMap = new HashMap<int, String>(); // Reverse of signalNameToNumberMap; never freed.
+        signalNumberToNameMap->reserveInitialCapacity(31);
+        signalNumberToNameMap->add(SIGHUP, signalNames[0]);
+        signalNumberToNameMap->add(SIGINT, signalNames[1]);
+        signalNumberToNameMap->add(SIGQUIT, signalNames[2]);
+        signalNumberToNameMap->add(SIGILL, signalNames[3]);
+        signalNumberToNameMap->add(SIGTRAP, signalNames[4]);
+        signalNumberToNameMap->add(SIGABRT, signalNames[5]);
+        signalNumberToNameMap->add(SIGIOT, signalNames[6]);
+        signalNumberToNameMap->add(SIGBUS, signalNames[7]);
+        signalNumberToNameMap->add(SIGFPE, signalNames[8]);
+        // signalNumberToNameMap->add(SIGKILL, signalNames[9]);
+        signalNumberToNameMap->add(SIGUSR1, signalNames[10]);
+        signalNumberToNameMap->add(SIGSEGV, signalNames[11]);
+        signalNumberToNameMap->add(SIGUSR2, signalNames[12]);
+        signalNumberToNameMap->add(SIGPIPE, signalNames[13]);
+        signalNumberToNameMap->add(SIGALRM, signalNames[14]);
+        signalNumberToNameMap->add(SIGTERM, signalNames[15]);
+        signalNumberToNameMap->add(SIGCHLD, signalNames[16]);
+        signalNumberToNameMap->add(SIGCONT, signalNames[17]);
+        // signalNumberToNameMap->add(SIGSTOP, signalNames[18]);
+        signalNumberToNameMap->add(SIGTSTP, signalNames[19]);
+        signalNumberToNameMap->add(SIGTTIN, signalNames[20]);
+        signalNumberToNameMap->add(SIGTTOU, signalNames[21]);
+        signalNumberToNameMap->add(SIGURG, signalNames[22]);
+        signalNumberToNameMap->add(SIGXCPU, signalNames[23]);
+        signalNumberToNameMap->add(SIGXFSZ, signalNames[24]);
+        signalNumberToNameMap->add(SIGVTALRM, signalNames[25]);
+        signalNumberToNameMap->add(SIGPROF, signalNames[26]);
+        signalNumberToNameMap->add(SIGWINCH, signalNames[27]);
+        signalNumberToNameMap->add(SIGIO, signalNames[28]);
+#ifdef SIGINFO
+        signalNumberToNameMap->add(SIGINFO, signalNames[29]); // Must go into the number->name map, otherwise the handler below drops SIGINFO.
+#endif
+        signalNumberToNameMap->add(SIGSYS, signalNames[30]);
+    });
 
-    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
-    thisObject->putDirect(vm, JSC::Identifier::fromString(vm, "release"_s), JSValue::decode(value), 0);
+    if (!signalToContextIdsMap) { // NOTE(review): checked outside signalToContextIdsMapLock — confirm contexts cannot race here.
+        signalToContextIdsMap = new HashMap<int, HashSet<uint32_t>>();
+    }
 
-    return true;
+    if (isAdded) {
+        if (auto signalNumber = signalNameToNumberMap->get(eventName.string())) { // get() yields 0 for names that are not signals, so other events are skipped.
+            uint32_t contextId = eventEmitter.scriptExecutionContext()->identifier();
+            Locker lock { signalToContextIdsMapLock };
+            if (!signalToContextIdsMap->contains(signalNumber)) { // First context interested in this signal: install the OS handler.
+                HashSet<uint32_t> contextIds;
+                contextIds.add(contextId);
+                signalToContextIdsMap->set(signalNumber, contextIds);
+
+                lock.unlockEarly();
+
+                struct sigaction action;
+                memset(&action, 0, sizeof(struct sigaction));
+
+                // Set the handler in the action struct
+                action.sa_handler = [](int signalNumber) {
+                    if (UNLIKELY(signalNumberToNameMap->find(signalNumber) == signalNumberToNameMap->end()))
+                        return;
+
+                    Locker lock { signalToContextIdsMapLock }; // NOTE(review): locking inside a signal handler is not async-signal-safe; may deadlock if the interrupted thread holds this lock.
+                    if (UNLIKELY(signalToContextIdsMap->find(signalNumber) == signalToContextIdsMap->end()))
+                        return;
+                    auto contextIds = signalToContextIdsMap->get(signalNumber);
+
+                    for (uint32_t contextId : contextIds) {
+                        auto* context = ScriptExecutionContext::getScriptExecutionContext(contextId);
+                        if (UNLIKELY(!context))
+                            continue;
+
+                        JSGlobalObject* lexicalGlobalObject = context->jsGlobalObject();
+                        Zig::GlobalObject* globalObject = static_cast<Zig::GlobalObject*>(lexicalGlobalObject);
+
+                        Process* process = jsCast<Process*>(globalObject->processObject());
+
+                        context->postCrossThreadTask(*process, &Process::emitSignalEvent, signalNumber); // Hand the emit off to the owning context as a posted task.
+                    }
+                };
+
+                // Clear the sa_mask
+                sigemptyset(&action.sa_mask);
+                sigaddset(&action.sa_mask, signalNumber);
+                action.sa_flags = SA_RESTART;
+
+                sigaction(signalNumber, &action, nullptr);
+            } else { // Handler already installed: just record this context as interested.
+                auto contextIds = signalToContextIdsMap->get(signalNumber);
+                contextIds.add(contextId);
+                signalToContextIdsMap->set(signalNumber, contextIds);
+            }
+        }
+    } else {
+        if (auto signalNumber = signalNameToNumberMap->get(eventName.string())) {
+            uint32_t contextId = eventEmitter.scriptExecutionContext()->identifier();
+            Locker lock { signalToContextIdsMapLock };
+            if (signalToContextIdsMap->find(signalNumber) != signalToContextIdsMap->end()) {
+                HashSet<uint32_t> contextIds = signalToContextIdsMap->get(signalNumber);
+                contextIds.remove(contextId);
+                if (contextIds.isEmpty()) { // Last interested context gone: restore the default disposition.
+                    signal(signalNumber, SIG_DFL);
+                    signalToContextIdsMap->remove(signalNumber);
+                } else {
+                    signalToContextIdsMap->set(signalNumber, contextIds);
+                }
+            }
+        }
+    }
 }
 
-// static const NeverDestroyed<String> signalNames[] = {
-//     MAKE_STATIC_STRING_IMPL("SIGHUP"),
-//     MAKE_STATIC_STRING_IMPL("SIGINT"),
-//     MAKE_STATIC_STRING_IMPL("SIGQUIT"),
-//     MAKE_STATIC_STRING_IMPL("SIGILL"),
-//     MAKE_STATIC_STRING_IMPL("SIGTRAP"),
-//     MAKE_STATIC_STRING_IMPL("SIGABRT"),
-//     MAKE_STATIC_STRING_IMPL("SIGIOT"),
-//     MAKE_STATIC_STRING_IMPL("SIGBUS"),
-//     MAKE_STATIC_STRING_IMPL("SIGFPE"),
-//     MAKE_STATIC_STRING_IMPL("SIGKILL"),
-//     MAKE_STATIC_STRING_IMPL("SIGUSR1"),
-//     MAKE_STATIC_STRING_IMPL("SIGSEGV"),
-//     MAKE_STATIC_STRING_IMPL("SIGUSR2"),
-//     MAKE_STATIC_STRING_IMPL("SIGPIPE"),
-//     MAKE_STATIC_STRING_IMPL("SIGALRM"),
-//     MAKE_STATIC_STRING_IMPL("SIGTERM"),
-//     MAKE_STATIC_STRING_IMPL("SIGCHLD"),
-//     MAKE_STATIC_STRING_IMPL("SIGCONT"),
-//     MAKE_STATIC_STRING_IMPL("SIGSTOP"),
-//     MAKE_STATIC_STRING_IMPL("SIGTSTP"),
-//     MAKE_STATIC_STRING_IMPL("SIGTTIN"),
-//     MAKE_STATIC_STRING_IMPL("SIGTTOU"),
-//     MAKE_STATIC_STRING_IMPL("SIGURG"),
-//     MAKE_STATIC_STRING_IMPL("SIGXCPU"),
-//     MAKE_STATIC_STRING_IMPL("SIGXFSZ"),
-//     MAKE_STATIC_STRING_IMPL("SIGVTALRM"),
-//     MAKE_STATIC_STRING_IMPL("SIGPROF"),
-//     MAKE_STATIC_STRING_IMPL("SIGWINCH"),
-//     MAKE_STATIC_STRING_IMPL("SIGIO"),
-//     MAKE_STATIC_STRING_IMPL("SIGINFO"),
-//     MAKE_STATIC_STRING_IMPL("SIGSYS"),
-// };
-// static const int signalNumbers[] = {
-//     SIGHUP,
-//     SIGINT,
-//     SIGQUIT,
-//     SIGILL,
-//     SIGTRAP,
-//     SIGABRT,
-//     SIGIOT,
-//     SIGBUS,
-//     SIGFPE,
-//     SIGKILL,
-//     SIGUSR1,
-//     SIGSEGV,
-//     SIGUSR2,
-//     SIGPIPE,
-//     SIGALRM,
-//     SIGTERM,
-//     SIGCHLD,
-//     SIGCONT,
-//     SIGSTOP,
-//     SIGTSTP,
-//     SIGTTIN,
-//     SIGTTOU,
-//     SIGURG,
-//     SIGXCPU,
-//     SIGXFSZ,
-//     SIGVTALRM,
-//     SIGPROF,
-//     SIGWINCH,
-//     SIGIO,
-//     SIGINFO,
-//     SIGSYS,
-// };
-
-// JSC_DEFINE_HOST_FUNCTION(jsFunctionProcessOn, (JSGlobalObject * globalObject, CallFrame* callFrame))
-// {
-//     VM& vm = globalObject->vm();
-//     auto scope = DECLARE_THROW_SCOPE(vm);
-
-//     if (callFrame->argumentCount() < 2) {
-//         throwVMError(globalObject, scope, "Not enough arguments"_s);
-//         return JSValue::encode(jsUndefined());
-//     }
-
-//     String eventName = callFrame->uncheckedArgument(0).toWTFString(globalObject);
-//     RETURN_IF_EXCEPTION(scope, encodedJSValue());
-// }
+void Process::emitSignalEvent(int signalNumber)
+{
+    // The event is named after the signal; listeners receive the raw signal number.
+    const Identifier eventName = Identifier::fromString(vm(), signalNumberToNameMap->get(signalNumber));
+    MarkedArgumentBuffer listenerArguments;
+    listenerArguments.append(jsNumber(signalNumber));
+    wrapped().emitForBindings(eventName, listenerArguments);
+}
 
 Process::~Process()
 {
-    for (auto& listener : this->wrapped().eventListenerMap().entries()) {
-    }
 }
 
 JSC_DEFINE_HOST_FUNCTION(Process_functionAbort, (JSGlobalObject * globalObject, CallFrame*))
@@ -559,201 +674,123 @@ JSC_DEFINE_HOST_FUNCTION(Process_emitWarning, (JSGlobalObject * lexicalGlobalObj
 
     auto* process = jsCast<Process*>(globalObject->processObject());
 
-    auto getError = [&]() -> JSValue {
+    JSObject* errorInstance = ([&]() -> JSObject* {
         JSValue arg0 = callFrame->uncheckedArgument(0);
         if (!arg0.isEmpty() && arg0.isCell() && arg0.asCell()->type() == ErrorInstanceType) {
-            return arg0;
+            return arg0.getObject();
         }
 
         WTF::String str = arg0.toWTFString(globalObject);
         return createError(globalObject, str);
-    };
+    })();
+
+    errorInstance->putDirect(vm, Identifier::fromString(vm, "name"_s), jsString(vm, String("warn"_s)), JSC::PropertyAttribute::DontEnum | 0);
 
     auto ident = Identifier::fromString(vm, "warning"_s);
     if (process->wrapped().hasEventListeners(ident)) {
         JSC::MarkedArgumentBuffer args;
-        args.append(getError());
+        args.append(errorInstance);
 
         process->wrapped().emit(ident, args);
         return JSValue::encode(jsUndefined());
     }
 
-    auto jsArgs = JSValue::encode(getError());
+    auto jsArgs = JSValue::encode(errorInstance);
     Zig__ConsoleClient__messageWithTypeAndLevel(reinterpret_cast<Zig::ConsoleClient*>(globalObject->consoleClient().get())->m_client, static_cast<uint32_t>(MessageType::Log),
         static_cast<uint32_t>(MessageLevel::Warning), globalObject, &jsArgs, 1);
     return JSValue::encode(jsUndefined());
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_lazyArgv0Getter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name))
+JSC_DEFINE_CUSTOM_GETTER(processExitCode, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name)) // Getter: reports the exit code held by the Bun VM as a JS number.
 {
-    JSC::JSObject* thisObject = JSValue::decode(thisValue).getObject();
-    EncodedJSValue ret = Bun__Process__getArgv0(globalObject);
-
-    if (LIKELY(thisObject)) {
-        thisObject->putDirect(globalObject->vm(), name, JSValue::decode(ret), 0);
+    Process* process = jsDynamicCast<Process*>(JSValue::decode(thisValue));
+    if (!process) { // receiver is not a Process: answer undefined rather than throwing
+        return JSValue::encode(jsUndefined());
     }
 
-    return ret;
+    return JSValue::encode(jsNumber(Bun__getExitCode(jsCast<Zig::GlobalObject*>(process->globalObject())->bunVM()))); // VM is taken from the Process's own global, not lexicalGlobalObject
 }
-
-JSC_DEFINE_CUSTOM_GETTER(Process_lazyExecArgvGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name))
+JSC_DEFINE_CUSTOM_SETTER(setProcessExitCode, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::EncodedJSValue thisValue, JSC::EncodedJSValue value, JSC::PropertyName)) // Setter: validates `value` and stores it as the Bun VM's exit code; returns false on any failure.
 {
-    JSC::JSObject* thisObject = JSValue::decode(thisValue).getObject();
-    EncodedJSValue ret = Bun__Process__getExecArgv(globalObject);
-
-    if (LIKELY(thisObject)) {
-        thisObject->putDirect(globalObject->vm(), name, JSValue::decode(ret), 0);
+    Process* process = jsDynamicCast<Process*>(JSValue::decode(thisValue));
+    if (!process) { // receiver is not a Process: refuse without throwing
+        return false;
     }
 
-    return ret;
-}
+    auto throwScope = DECLARE_THROW_SCOPE(process->vm());
+    JSValue exitCode = JSValue::decode(value);
+    if (!exitCode.isNumber()) {
+        throwTypeError(lexicalGlobalObject, throwScope, "exitCode must be a number"_s);
+        return false;
+    }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_lazyExecPathGetter, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName name))
-{
-    JSC::JSObject* thisObject = JSValue::decode(thisValue).getObject();
-    EncodedJSValue ret = Bun__Process__getExecPath(globalObject);
+    if (!exitCode.isInt32()) { // numbers that are not int32 (fractions, NaN, out of range) are rejected
+        throwRangeError(lexicalGlobalObject, throwScope, "The \"code\" argument must be an integer"_s);
+        return false; // this setter returns bool, so report failure the same way as every other path
+    }
 
-    if (LIKELY(thisObject)) {
-        thisObject->putDirect(globalObject->vm(), name, JSValue::decode(ret), 0);
+    int exitCodeInt = exitCode.toInt32(lexicalGlobalObject);
+    RETURN_IF_EXCEPTION(throwScope, false);
+    if (exitCodeInt < 0 || exitCodeInt > 127) { // only 0..127 is accepted before narrowing to uint8_t
+        throwRangeError(lexicalGlobalObject, throwScope, "exitCode must be between 0 and 127"_s);
+        return false;
     }
 
-    return ret;
+    void* ptr = jsCast<Zig::GlobalObject*>(process->globalObject())->bunVM();
+    Bun__setExitCode(ptr, static_cast<uint8_t>(exitCodeInt));
+    return true;
 }
 
-void Process::finishCreation(JSC::VM& vm)
+static JSValue constructVersions(VM& vm, JSObject* processObject)
 {
-    Base::finishCreation(vm);
-    auto clientData = WebCore::clientData(vm);
-    auto* globalObject = reinterpret_cast<Zig::GlobalObject*>(this->globalObject());
-
-    putDirectCustomAccessor(vm, clientData->builtinNames().pidPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getPID, nullptr),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirectCustomAccessor(vm, clientData->builtinNames().ppidPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getPPID, nullptr),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "title"_s),
-        JSC::CustomGetterSetter::create(vm, Process_getTitle, Process_setTitle),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirectCustomAccessor(vm, clientData->builtinNames().argvPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getArgv, Process_setArgv),
-        static_cast<unsigned>(JSC::PropertyAttribute::CustomValue));
-
-    putDirect(vm, JSC::Identifier::fromString(vm, "revision"_s),
-        JSC::jsString(vm, makeAtomString(Bun__version_sha)), 0);
-
-    this->putDirect(vm, clientData->builtinNames().nextTickPublicName(),
-        JSC::JSFunction::create(vm, globalObject, 1,
-            MAKE_STATIC_STRING_IMPL("nextTick"), Process_functionNextTick, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, JSC::Identifier::fromString(vm, "dlopen"_s),
-        JSC::JSFunction::create(vm, globalObject, 1,
-            MAKE_STATIC_STRING_IMPL("dlopen"), Process_functionDlopen, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, clientData->builtinNames().cwdPublicName(),
-        JSC::JSFunction::create(vm, globalObject, 0,
-            MAKE_STATIC_STRING_IMPL("cwd"), Process_functionCwd, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, clientData->builtinNames().chdirPublicName(),
-        JSC::JSFunction::create(vm, globalObject, 0,
-            MAKE_STATIC_STRING_IMPL("chdir"), Process_functionChdir, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    this->putDirect(vm, JSC::Identifier::fromString(vm, "exit"_s),
-        JSC::JSFunction::create(vm, globalObject, 0,
-            MAKE_STATIC_STRING_IMPL("exit"), Process_functionExit, ImplementationVisibility::Public),
-        PropertyAttribute::Function | 0);
-
-    putDirectCustomAccessor(
-        vm, clientData->builtinNames().versionsPublicName(),
-        JSC::CustomGetterSetter::create(vm, Process_getVersionsLazy, Process_setVersionsLazy), 0);
-    // this should be transpiled out, but just incase
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "browser"_s),
-        JSC::JSValue(false));
-
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "exitCode"_s),
-        JSC::JSValue(JSC::jsNumber(0)));
-
-    this->putDirect(this->vm(), clientData->builtinNames().versionPublicName(),
-        JSC::jsString(this->vm(), makeString("v", REPORTED_NODE_VERSION)));
-
-    // this gives some way of identifying at runtime whether the SSR is happening in node or not.
-    // this should probably be renamed to what the name of the bundler is, instead of "notNodeJS"
-    // but it must be something that won't evaluate to truthy in Node.js
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "isBun"_s), JSC::JSValue(true));
-#if defined(__APPLE__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "platform"_s),
-        JSC::jsString(this->vm(), makeAtomString("darwin")));
-#else
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "platform"_s),
-        JSC::jsString(this->vm(), makeAtomString("linux")));
-#endif
-
-#if defined(__x86_64__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("x64")));
-#elif defined(__i386__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("x86")));
-#elif defined(__arm__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("arm")));
-#elif defined(__aarch64__)
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "arch"_s),
-        JSC::jsString(this->vm(), makeAtomString("arm64")));
-#endif
-
-    JSC::JSFunction* hrtime = JSC::JSFunction::create(vm, globalObject, 0,
-        MAKE_STATIC_STRING_IMPL("hrtime"), Process_functionHRTime, ImplementationVisibility::Public);
-
-    JSC::JSFunction* hrtimeBigInt = JSC::JSFunction::create(vm, globalObject, 0,
-        MAKE_STATIC_STRING_IMPL("bigint"), Process_functionHRTimeBigInt, ImplementationVisibility::Public);
-
-    hrtime->putDirect(vm, JSC::Identifier::fromString(vm, "bigint"_s), hrtimeBigInt);
-    this->putDirect(this->vm(), JSC::Identifier::fromString(this->vm(), "hrtime"_s), hrtime);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "release"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_getterRelease, Process_setterRelease), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "stdout"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyStdoutGetter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "stderr"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyStderrGetter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "stdin"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyStdinGetter, Process_defaultSetter), 0);
-
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "abort"_s),
-        0, Process_functionAbort, ImplementationVisibility::Public, NoIntrinsic, 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "argv0"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyArgv0Getter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "execPath"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyExecPathGetter, Process_defaultSetter), 0);
-
-    this->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "execArgv"_s)),
-        JSC::CustomGetterSetter::create(vm, Process_lazyExecArgvGetter, Process_defaultSetter), 0);
+    auto* globalObject = processObject->globalObject();
+    JSC::JSObject* object = JSC::constructEmptyObject(globalObject, globalObject->objectPrototype(), 19);
 
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "uptime"_s),
-        0, Process_functionUptime, ImplementationVisibility::Public, NoIntrinsic, 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "node"_s),
+        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(REPORTED_NODE_VERSION))));
+    object->putDirect(
+        vm, JSC::Identifier::fromString(vm, "bun"_s),
+        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(Bun__version + 1 /* prefix with v */))));
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_webkit))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "boringssl"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_boringssl))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "libarchive"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_libarchive))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "mimalloc"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_mimalloc))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "picohttpparser"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_picohttpparser))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "uwebsockets"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_uws))), 0);
+    // "webkit" is written once, above, with Bun__versions_webkit. A second putDirect used to sit
+    // here and overwrote an earlier BUN_WEBKIT_VERSION value; key order and final value are unchanged.
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "zig"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zig))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "zlib"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zlib))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "tinycc"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_tinycc))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "lolhtml"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_lolhtml))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "ares"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_c_ares))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "usockets"_s),
+        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_usockets))), 0);
 
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "umask"_s),
-        1, Process_functionUmask, ImplementationVisibility::Public, NoIntrinsic, 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "v8"_s), JSValue(JSC::jsString(vm, makeString("10.8.168.20-node.8"_s))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "uv"_s), JSValue(JSC::jsString(vm, makeString("1.44.2"_s))), 0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "napi"_s), JSValue(JSC::jsString(vm, makeString("8"_s))), 0);
 
-    this->putDirectBuiltinFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "binding"_s),
-        processObjectInternalsBindingCodeGenerator(vm),
-        0);
+    object->putDirect(vm, JSC::Identifier::fromString(vm, "modules"_s),
+        JSC::JSValue(JSC::jsString(vm, makeAtomString("108"))));
 
-    this->putDirect(vm, vm.propertyNames->toStringTagSymbol, jsString(vm, String("process"_s)), 0);
+    return object;
+}
 
+static JSValue constructProcessConfigObject(VM& vm, JSObject* processObject)
+{
+    auto* globalObject = processObject->globalObject();
     //   target_defaults:
     //    { cflags: [],
     //      default_configuration: 'Release',
@@ -783,168 +820,779 @@ void Process::finishCreation(JSC::VM& vm)
         JSC::jsNumber(1), 0);
     config->putDirect(vm, JSC::Identifier::fromString(vm, "target_defaults"_s), JSC::constructEmptyObject(globalObject), 0);
     config->putDirect(vm, JSC::Identifier::fromString(vm, "variables"_s), variables, 0);
-    this->putDirect(vm, JSC::Identifier::fromString(vm, "config"_s), config, 0);
 
-    this->putDirectNativeFunction(vm, globalObject, JSC::Identifier::fromString(this->vm(), "emitWarning"_s),
-        1, Process_emitWarning, ImplementationVisibility::Public, NoIntrinsic, 0);
+    return config;
 }
 
-const JSC::ClassInfo Process::s_info = { "Process"_s, &Base::s_info, nullptr, nullptr,
-    CREATE_METHOD_TABLE(Process) };
+// Builds a plain object carrying name, lts, sourceUrl, headersUrl and libUrl (in that order).
+static JSValue constructProcessReleaseObject(VM& vm, JSObject* processObject)
+{
+    auto* info = JSC::constructEmptyObject(processObject->globalObject());
+    info->putDirect(vm, Identifier::fromString(vm, "name"_s), jsString(vm, WTF::String("bun"_s)), 0);
+    info->putDirect(vm, Identifier::fromString(vm, "lts"_s), jsBoolean(false), 0);
+    info->putDirect(vm, Identifier::fromString(vm, "sourceUrl"_s), jsString(vm, WTF::String(Bun__githubURL, strlen(Bun__githubURL))), 0);
+    info->putDirect(vm, Identifier::fromString(vm, "headersUrl"_s), jsEmptyString(vm), 0);
+    info->putDirect(vm, Identifier::fromString(vm, "libUrl"_s), jsEmptyString(vm), 0);
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getTitle, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+    return info;
+}
+
+static JSValue constructProcessHrtimeObject(VM& vm, JSObject* processObject)
 {
-    ZigString str;
-    Bun__Process__getTitle(globalObject, &str);
-    return JSValue::encode(Zig::toJSStringValue(str, globalObject));
+    // Builds a native function named "hrtime" that carries a second native
+    // function under its own "bigint" property.
+    auto* global = processObject->globalObject();
+
+    auto* hrtimeFn = JSC::JSFunction::create(vm, global, 0, String("hrtime"_s), Process_functionHRTime, ImplementationVisibility::Public);
+    auto* bigintFn = JSC::JSFunction::create(vm, global, 0, String("bigint"_s), Process_functionHRTimeBigInt, ImplementationVisibility::Public);
+    hrtimeFn->putDirect(vm, JSC::Identifier::fromString(vm, "bigint"_s), bigintFn);
+
+    // The outer function is what the caller receives.
+    return hrtimeFn;
 }
 
-JSC_DEFINE_CUSTOM_SETTER(Process_setTitle,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName))
+// Calls the getStdioWriteStream builtin with `fd` and returns its result; an exception reported by the call is rethrown.
+static JSValue constructStdioWriteStream(JSC::JSGlobalObject* globalObject, int fd)
+{
+    auto& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSC::JSFunction* getStdioWriteStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdioWriteStreamCodeGenerator(vm), globalObject);
+    JSC::MarkedArgumentBuffer args;
+    args.append(JSC::jsNumber(fd));
+
+    JSC::CallData callData = JSC::getCallData(getStdioWriteStream);
+
+    NakedPtr<JSC::Exception> returnedException = nullptr;
+    auto result = JSC::call(globalObject, getStdioWriteStream, callData, globalObject->globalThis(), args, returnedException);
+    RETURN_IF_EXCEPTION(scope, {});
+
+    if (returnedException) {
+        throwException(globalObject, scope, returnedException.get());
+        return {};
+    }
+
+    return result;
+}
+
+static JSValue constructStdout(VM& vm, JSObject* processObject)
+{
+    auto* globalObject = Bun__getDefaultGlobal(); // uses the default global rather than processObject's; both parameters are unused
+    return constructStdioWriteStream(globalObject, 1); // write stream for file descriptor 1
+}
+
+static JSValue constructStderr(VM& vm, JSObject* processObject)
+{
+    auto* globalObject = Bun__getDefaultGlobal(); // uses the default global rather than processObject's; both parameters are unused
+    return constructStdioWriteStream(globalObject, 2); // write stream for file descriptor 2
+}
+
+// Calls the getStdinStream builtin with STDIN_FILENO on the default global and returns its result;
+// an exception reported by the call is rethrown. `processObject` is not consulted.
+static JSValue constructStdin(VM& vm, JSObject* processObject)
+{
+    auto* globalObject = Bun__getDefaultGlobal();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSC::JSFunction* getStdinStream = JSC::JSFunction::create(vm, processObjectInternalsGetStdinStreamCodeGenerator(vm), globalObject);
+    JSC::MarkedArgumentBuffer args;
+    args.append(JSC::jsNumber(STDIN_FILENO));
+
+    JSC::CallData callData = JSC::getCallData(getStdinStream);
+
+    NakedPtr<JSC::Exception> returnedException = nullptr;
+    auto result = JSC::call(globalObject, getStdinStream, callData, globalObject, args, returnedException);
+    RETURN_IF_EXCEPTION(scope, {});
+
+    if (UNLIKELY(returnedException)) {
+        throwException(globalObject, scope, returnedException.get());
+        return {};
+    }
+
+    RELEASE_AND_RETURN(scope, result);
+}
+
+static JSValue constructPid(VM& vm, JSObject* processObject)
+{
+    return jsNumber(getpid()); // id of the running process from getpid(); both parameters are unused
+}
+
+static JSValue constructPpid(VM& vm, JSObject* processObject)
+{
+    return jsNumber(getppid()); // id of the parent process from getppid(); both parameters are unused
+}
+
+// Returns what Bun__Process__getArgv0 produces for the global that owns `processObject`.
+static JSValue constructArgv0(VM& vm, JSObject* processObject)
+{
+    return JSValue::decode(Bun__Process__getArgv0(processObject->globalObject()));
+}
+
+// Returns what Bun__Process__getExecArgv produces for the global that owns `processObject`.
+static JSValue constructExecArgv(VM& vm, JSObject* processObject)
+{
+    return JSValue::decode(Bun__Process__getExecArgv(processObject->globalObject()));
+}
+
+// Returns what Bun__Process__getExecPath produces for the global that owns `processObject`.
+static JSValue constructExecPath(VM& vm, JSObject* processObject)
+{
+    return JSValue::decode(Bun__Process__getExecPath(processObject->globalObject()));
+}
+
+// Returns what Bun__Process__getArgv produces for the global that owns `processObject`.
+static JSValue constructArgv(VM& vm, JSObject* processObject)
+{
+    return JSValue::decode(Bun__Process__getArgv(processObject->globalObject()));
+}
+
+static JSValue constructArch(VM& vm, JSObject* processObject)
+{ // Chooses the architecture string at compile time from the target's predefined macros.
+#if defined(__x86_64__)
+    return JSC::jsString(vm, makeAtomString("x64"));
+#elif defined(__i386__)
+    return JSC::jsString(vm, makeAtomString("x86"));
+#elif defined(__arm__)
+    return JSC::jsString(vm, makeAtomString("arm"));
+#elif defined(__aarch64__)
+    return JSC::jsString(vm, makeAtomString("arm64"));
+#else // none of the recognised architecture macros is defined: stop the build
+#error "Unknown architecture"
+#endif
+}
+
+static JSValue constructPlatform(VM& vm, JSObject* processObject)
+{ // Chooses the platform string at compile time; only Apple and Linux targets are recognised.
+#if defined(__APPLE__)
+    return JSC::jsString(vm, makeAtomString("darwin"));
+#elif defined(__linux__)
+    return JSC::jsString(vm, makeAtomString("linux"));
+#else // any other target stops the build
+#error "Unknown platform"
+#endif
+}
+
+static JSValue constructBrowser(VM& vm, JSObject* processObject)
+{
+    return jsBoolean(false); // constant false; both parameters are unused
+}
+
+static JSValue constructVersion(VM& vm, JSObject* processObject)
+{
+    return JSC::jsString(vm, makeString("v", REPORTED_NODE_VERSION)); // "v" followed by REPORTED_NODE_VERSION
+}
+
+static JSValue constructIsBun(VM& vm, JSObject* processObject)
+{
+    return jsBoolean(true); // constant true; both parameters are unused
+}
+
+static JSValue constructRevision(VM& vm, JSObject* processObject)
+{
+    return JSC::jsString(vm, makeAtomString(Bun__version_sha)); // JS string made from Bun__version_sha
+}
+
+// Returns processEnvObject() of the Zig::GlobalObject that owns `processObject`.
+static JSValue constructEnv(VM& vm, JSObject* processObject)
+{
+    return jsCast<Zig::GlobalObject*>(processObject->globalObject())->processEnvObject();
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetuid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(getuid())); // real user id from getuid(); call arguments are ignored
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongeteuid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(geteuid())); // effective user id from geteuid(); call arguments are ignored
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetegid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(getegid())); // effective group id from getegid(); call arguments are ignored
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetgid, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    return JSValue::encode(jsNumber(getgid())); // real group id from getgid(); call arguments are ignored
+}
+
+// Returns a JS array of the caller's supplementary group ids, appending the effective gid if the list lacks it.
+JSC_DEFINE_HOST_FUNCTION(Process_functiongetgroups, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    // getgroups(0, nullptr) only reports the count; the second call fills the buffer and may return fewer entries.
+    int ngroups = getgroups(0, nullptr);
+    Vector<gid_t> groupVector(ngroups > 0 ? ngroups : 0);
+    if (ngroups > 0)
+        ngroups = getgroups(ngroups, groupVector.data());
+    if (ngroups == -1) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("getgroups"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
+
+    gid_t egid = getegid();
+    JSArray* groups = constructEmptyArray(globalObject, nullptr, static_cast<unsigned int>(ngroups));
+    bool needsEgid = true;
+    for (unsigned i = 0; i < static_cast<unsigned>(ngroups); i++) {
+        gid_t current = groupVector[i];
+        if (current == egid)
+            needsEgid = false;
+        groups->putDirectIndex(globalObject, i, jsNumber(current));
+    }
+
+    if (needsEgid) // POSIX leaves it unspecified whether getgroups() includes the effective gid
+        groups->push(globalObject, jsNumber(egid));
+    return JSValue::encode(groups);
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionAssert, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+
+    JSValue arg0 = callFrame->argument(0);
+    bool condition = arg0.toBoolean(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+    if (condition) {
+        return JSValue::encode(jsUndefined());
+    }
+
+    JSValue arg1 = callFrame->argument(1);
+    String message = arg1.isUndefined() ? String() : arg1.toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+    auto error = createError(globalObject, makeString("Assertion failed: "_s, message));
+    error->putDirect(vm, Identifier::fromString(vm, "code"_s), jsString(vm, makeString("ERR_ASSERTION"_s)));
+    throwException(globalObject, throwScope, error);
+    return JSValue::encode(jsUndefined());
+}
+
+// Exits via Bun__Process__exit with the given code (int32 in 0..127), or the VM's stored exit code when none is given.
+JSC_DEFINE_HOST_FUNCTION(Process_functionReallyExit, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    auto& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    JSValue codeArg = callFrame->argument(0);
+    uint8_t status = 0;
+    if (codeArg.isUndefinedOrNull()) {
+        status = Bun__getExitCode(Bun__getVM());
+    } else if (!codeArg.isNumber()) {
+        throwTypeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+        return JSC::JSValue::encode(JSC::JSValue {});
+    } else {
+        if (!codeArg.isInt32()) {
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+
+        int requested = codeArg.toInt32(globalObject);
+        RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
+
+        if (requested < 0 || requested > 127) {
+            throwRangeError(globalObject, throwScope, "The \"code\" argument must be an integer between 0 and 127"_s);
+            return JSC::JSValue::encode(JSC::JSValue {});
+        }
+
+        status = static_cast<uint8_t>(requested);
+    }
+
+    auto* zigGlobal = jsDynamicCast<Zig::GlobalObject*>(globalObject);
+    if (UNLIKELY(!zigGlobal))
+        zigGlobal = Bun__getDefaultGlobal();
+    Bun__Process__exit(zigGlobal, status);
+    __builtin_unreachable();
+}
+
+template<typename Visitor>
+void Process::visitChildrenImpl(JSCell* cell, Visitor& visitor)
+{
+    auto* self = jsCast<Process*>(cell); // GC hands in the cell; it must be a Process
+    ASSERT_GC_OBJECT_INHERITS(self, info());
+    Base::visitChildren(self, visitor); // base-class children first
+    self->cpuUsageStructure.visit(visitor); // then the two structure members owned by Process
+    self->memoryUsageStructure.visit(visitor);
+}
+
+DEFINE_VISIT_CHILDREN(Process);
+
+// Builds an object Structure whose properties are "user" followed by "system".
+static Structure* constructCPUUsageStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
+{
+    // Start from the cached empty-object structure (third argument 2, matching the two fields).
+    JSC::Structure* shape = globalObject->structureCache().emptyObjectStructureForPrototype(globalObject, globalObject->objectPrototype(), 2);
+    PropertyOffset unusedOffset;
+
+    // Transitions are applied in list order, so "user" is added before "system".
+    for (ASCIILiteral fieldName : { "user"_s, "system"_s }) {
+        shape = shape->addPropertyTransition(
+            vm,
+            shape,
+            JSC::Identifier::fromString(vm, fieldName),
+            0,
+            unusedOffset);
+    }
+    return shape;
+}
+static Structure* constructMemoryUsageStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject)
+{
+    JSC::Structure* shape = globalObject->structureCache().emptyObjectStructureForPrototype(globalObject, globalObject->objectPrototype(), 5);
+    PropertyOffset unusedOffset;
+    shape = shape->addPropertyTransition(
+        vm,
+        shape,
+        JSC::Identifier::fromString(vm, "rss"_s),
+        0,
+        unusedOffset);
+    shape = shape->addPropertyTransition(
+        vm,
+        shape,
+        JSC::Identifier::fromString(vm, "heapTotal"_s),
+        0,
+        unusedOffset);
+    shape = shape->addPropertyTransition(
+        vm,
+        shape,
+        JSC::Identifier::fromString(vm, "heapUsed"_s),
+        0,
+        unusedOffset);
+    shape = shape->addPropertyTransition(
+        vm,
+        shape,
+        JSC::Identifier::fromString(vm, "external"_s),
+        0,
+        unusedOffset);
+    shape = shape->addPropertyTransition(
+        vm,
+        shape,
+        JSC::Identifier::fromString(vm, "arrayBuffers"_s),
+        0,
+        unusedOffset);
+    // Resulting property order: rss, heapTotal, heapUsed, external, arrayBuffers.
+    return shape;
+}
+
+// Resolves the Process to act on: `thisValue` when it is one, otherwise a global's process object.
+static Process* getProcessObject(JSC::JSGlobalObject* lexicalGlobalObject, JSValue thisValue)
+{
+    Process* receiver = jsDynamicCast<Process*>(thisValue);
+    if (LIKELY(receiver))
+        return receiver;
+
+    // Detached call, e.g. "var memoryUsage = process.memoryUsage; memoryUsage()":
+    // look the process object up through the lexical global instead.
+    auto* owningGlobal = jsDynamicCast<Zig::GlobalObject*>(lexicalGlobalObject);
+
+    // The lexical global may not be a Zig::GlobalObject (the original note cites
+    // calls made from inside a node:vm); use the default global in that case.
+    if (UNLIKELY(!owningGlobal))
+        owningGlobal = Bun__getDefaultGlobal();
+
+    return jsCast<Process*>(owningGlobal->processObject());
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionCpuUsage,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame)) // Returns { user, system } CPU times in microseconds, minus an optional previous reading passed as argument 0.
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    struct rusage rusage; // filled in by getrusage() just below
+    if (getrusage(RUSAGE_SELF, &rusage) != 0) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("getrusage"_s);
+        error.message = Bun::toString("Failed to get CPU usage"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
 
-    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
-    JSC::JSString* jsString = JSC::jsDynamicCast<JSC::JSString*>(JSValue::decode(value));
-    if (!thisObject || !jsString) {
-        return false;
+    auto* process = getProcessObject(globalObject, callFrame->thisValue());
+
+    Structure* cpuUsageStructure = process->cpuUsageStructure.getInitializedOnMainThread(process); // presumably the structure built by constructCPUUsageStructure — TODO confirm where it is initialised
+
+    constexpr double MICROS_PER_SEC = 1000000.0;
+
+    double user = MICROS_PER_SEC * rusage.ru_utime.tv_sec + rusage.ru_utime.tv_usec; // whole seconds scaled to microseconds plus the microsecond remainder
+    double system = MICROS_PER_SEC * rusage.ru_stime.tv_sec + rusage.ru_stime.tv_usec;
+
+    if (callFrame->argumentCount() > 0) {
+        JSValue comparatorValue = callFrame->argument(0);
+        if (!comparatorValue.isUndefined()) { // an explicit undefined is treated like no argument
+            if (UNLIKELY(!comparatorValue.isObject())) {
+                throwTypeError(globalObject, throwScope, "Expected an object as the first argument"_s);
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            JSC::JSObject* comparator = comparatorValue.getObject();
+            JSValue userValue;
+            JSValue systemValue;
+
+            if (LIKELY(comparator->structureID() == cpuUsageStructure->id())) { // fast path: same structure, so read both fields by offset
+                userValue = comparator->getDirect(0);
+                systemValue = comparator->getDirect(1);
+            } else { // slow path: ordinary property lookups, which may throw
+                userValue = comparator->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "user"_s));
+                RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::jsUndefined()));
+
+                systemValue = comparator->getIfPropertyExists(globalObject, JSC::Identifier::fromString(vm, "system"_s));
+                RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::jsUndefined()));
+            }
+
+            if (UNLIKELY(!userValue || !userValue.isNumber())) { // an empty JSValue here means the property was missing
+                throwTypeError(globalObject, throwScope, "Expected a number for the user property"_s);
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            if (UNLIKELY(!systemValue || !systemValue.isNumber())) {
+                throwTypeError(globalObject, throwScope, "Expected a number for the system property"_s);
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            double userComparator = userValue.asNumber();
+            double systemComparator = systemValue.asNumber();
+
+            user -= userComparator; // report usage relative to the supplied reading
+            system -= systemComparator;
+        }
     }
 
-    ZigString str = Zig::toZigString(jsString, globalObject);
-    Bun__Process__setTitle(globalObject, &str);
+    JSC::JSObject* result = JSC::constructEmptyObject(vm, cpuUsageStructure);
+    RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::jsUndefined()));
 
-    return true;
+    result->putDirectOffset(vm, 0, JSC::jsNumber(user)); // offsets 0 and 1 mirror the getDirect(0)/getDirect(1) reads above
+    result->putDirectOffset(vm, 1, JSC::jsNumber(system));
+
+    RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(result));
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getArgv, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+// Writes this process's resident set size, in bytes, to *rss. Returns 0 on success; on failure
+// returns non-zero and leaves the reason in errno, which is what both callers put in their SystemError.
+static int getRSS(size_t* rss)
+{
+#if defined(__APPLE__)
+    mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT;
+    task_basic_info_data_t info;
+    kern_return_t err = task_info(mach_task_self(),
+        TASK_BASIC_INFO,
+        reinterpret_cast<task_info_t>(&info),
+        &count);
+    if (err == KERN_SUCCESS) {
+        *rss = (size_t)info.resident_size;
+        return 0;
+    }
+
+    // task_info() reports failure only through kern_return_t; set an errno for callers to report.
+    errno = EINVAL;
+    return -1;
+#elif defined(__linux__)
+    // Taken from libuv: the 24th field of /proc/self/stat is the resident set size in pages.
+    char buf[1024];
+    const char* s;
+    ssize_t n;
+    long val;
+    int fd;
+    int i;
+
+    do
+        fd = open("/proc/self/stat", O_RDONLY);
+    while (fd == -1 && errno == EINTR);
+
+    if (fd == -1)
+        return errno;
+
+    do
+        n = read(fd, buf, sizeof(buf) - 1);
+    while (n == -1 && errno == EINTR);
+
+    // Save read()'s errno first: close() below may overwrite it.
+    const int readErrno = errno;
+    // Close exactly once: Linux releases the fd even when close() reports EINTR, so a retry could close a reused fd.
+    close(fd);
+
+    if (n == -1)
+        return errno = readErrno;
+    buf[n] = '\0';
+
+    s = strchr(buf, ' ');
+    if (s == NULL)
+        goto err;
+
+    s += 1;
+    if (*s != '(')
+        goto err;
+
+    // The command name sits between parentheses and may itself contain ')': anchor on the last one.
+    s = strrchr(s, ')');
+    if (s == NULL)
+        goto err;
+
+    for (i = 1; i <= 22; i++) {
+        s = strchr(s + 1, ' ');
+        if (s == NULL)
+            goto err;
+    }
+
+    errno = 0;
+    val = strtol(s, NULL, 10);
+    if (errno != 0 || val < 0)
+        goto err;
+
+    *rss = val * getpagesize();
+    return 0;
+
+err:
+    // Callers build their SystemError from errno, so set it as well as returning it.
+    return errno = EINVAL;
+#else
+#error "Unsupported platform"
+#endif
+}
+
+JSC_DEFINE_HOST_FUNCTION(Process_functionMemoryUsage,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    auto* process = getProcessObject(globalObject, callFrame->thisValue());
+    // Implements process.memoryUsage(): reports the RSS from getRSS() plus JSC heap statistics in a memoryUsageStructure object.
+    size_t current_rss = 0;
+    if (getRSS(&current_rss) != 0) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("memoryUsage"_s);
+        error.message = Bun::toString("Failed to get memory usage"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSC::JSValue::encode(JSC::JSValue {}); // an exception is pending, so the empty value is returned
+    }
 
-    Zig::Process* thisObject = JSC::jsDynamicCast<Zig::Process*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return JSValue::encode(JSC::jsUndefined());
+    JSC::JSObject* result = JSC::constructEmptyObject(vm, process->memoryUsageStructure.getInitializedOnMainThread(process));
+    if (UNLIKELY(throwScope.exception())) {
+        return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    JSC::EncodedJSValue argv_ = Bun__Process__getArgv(globalObject);
-    auto clientData = WebCore::clientData(vm);
+    // Node.js:
+    // {
+    //    rss: 4935680,
+    //    heapTotal: 1826816,
+    //    heapUsed: 650472,
+    //    external: 49879,
+    //    arrayBuffers: 9386
+    // }
 
-    thisObject->putDirect(vm, clientData->builtinNames().argvPublicName(),
-        JSC::JSValue::decode(argv_), 0);
+    result->putDirectOffset(vm, 0, JSC::jsNumber(current_rss));
+    result->putDirectOffset(vm, 1, JSC::jsNumber(vm.heap.blockBytesAllocated()));
 
-    return argv_;
+    // heap.size() loops through every cell...
+    // TODO: add a binding for heap.sizeAfterLastCollection()
+    result->putDirectOffset(vm, 2, JSC::jsNumber(vm.heap.sizeAfterLastEdenCollection()));
+
+    result->putDirectOffset(vm, 3, JSC::jsNumber(vm.heap.externalMemorySize()));
+
+    // We report 0 for this because m_arrayBuffers in JSC::Heap is private and we need to add a binding
+    // If we use objectTypeCounts(), it's hideously slow because it loops through every single object in the heap
+    // TODO: add a binding for m_arrayBuffers, registerWrapper() in TypedArrayController doesn't work
+    result->putDirectOffset(vm, 4, JSC::jsNumber(0));
+    // NOTE(review): offsets 0-4 presumably follow the key order of the Node.js shape above; confirm in constructMemoryUsageStructure.
+    RELEASE_AND_RETURN(throwScope, JSC::JSValue::encode(result));
 }
 
-JSC_DEFINE_CUSTOM_SETTER(Process_setArgv,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::EncodedJSValue value, JSC::PropertyName))
+JSC_DEFINE_HOST_FUNCTION(Process_functionMemoryUsageRSS, // installed as the `rss` property of memoryUsage by constructMemoryUsage()
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
 
-    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return false;
+    size_t current_rss = 0; // resident set size in bytes, filled in by getRSS()
+    if (getRSS(&current_rss) != 0) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("memoryUsage"_s);
+        error.message = Bun::toString("Failed to get memory usage"_s);
+        throwException(globalObject, throwScope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSC::JSValue::encode(JSC::JSValue {}); // an exception is pending, so the empty value is returned
     }
 
-    auto clientData = WebCore::clientData(vm);
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsNumber(current_rss))); // returns only the number, no object
+}
+
+// process.openStdin() (see processObjectTable): calls process.stdin.resume() when present and returns process.stdin.
+JSC_DEFINE_HOST_FUNCTION(Process_functionOpenStdin, (JSGlobalObject * globalObject, CallFrame* callFrame))
+{
+    auto& vm = globalObject->vm();
+    Zig::GlobalObject* global = jsDynamicCast<Zig::GlobalObject*>(globalObject);
+    if (UNLIKELY(!global)) {
+        global = Bun__getDefaultGlobal();
+    }
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    // Check for an exception right after the lookup, not only when a value came back.
+    JSValue stdinValue = global->processObject()->getIfPropertyExists(globalObject, Identifier::fromString(vm, "stdin"_s));
+    RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+    if (stdinValue) {
+        if (!stdinValue.isObject()) {
+            throwTypeError(globalObject, throwScope, "stdin is not an object"_s);
+            return JSValue::encode(jsUndefined());
+        }
+
+        JSValue resumeValue = stdinValue.getObject()->getIfPropertyExists(globalObject, Identifier::fromString(vm, "resume"_s));
+        RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+        // A missing "resume" comes back as an empty JSValue (neither undefined nor null): test for it before casting.
+        if (resumeValue && !resumeValue.isUndefinedOrNull()) {
+            auto resumeFunction = jsDynamicCast<JSFunction*>(resumeValue);
+            if (UNLIKELY(!resumeFunction)) {
+                throwTypeError(globalObject, throwScope, "stdin.resume is not a function"_s);
+                return JSValue::encode(jsUndefined());
+            }
+            auto callData = getCallData(resumeFunction);
+            MarkedArgumentBuffer args;
+            JSC::call(globalObject, resumeFunction, callData, stdinValue, args);
+            RETURN_IF_EXCEPTION(throwScope, JSValue::encode(jsUndefined()));
+        }
+
+        RELEASE_AND_RETURN(throwScope, JSValue::encode(stdinValue));
+    }
 
-    return thisObject->putDirect(vm, clientData->builtinNames().argvPublicName(),
-        JSC::JSValue::decode(value));
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsUndefined()));
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getPID, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+JSC_DEFINE_HOST_FUNCTION(Process_stubEmptyFunction, (JSGlobalObject * globalObject, CallFrame* callFrame))
 {
-    return JSC::JSValue::encode(JSC::JSValue(getpid()));
+    return JSValue::encode(jsUndefined()); // placeholder host function: ignores its arguments and returns undefined
 }
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getPPID, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+JSC_DEFINE_HOST_FUNCTION(Process_stubFunctionReturningArray, (JSGlobalObject * globalObject, CallFrame* callFrame))
 {
-    return JSC::JSValue::encode(JSC::JSValue(getppid()));
+    return JSValue::encode(JSC::constructEmptyArray(globalObject, nullptr)); // placeholder: a new empty array on every call
 }
 
-#if !defined(BUN_WEBKIT_VERSION)
-#define BUN_WEBKIT_VERSION "unknown"
-#endif
+static JSValue Process_stubEmptyObject(VM& vm, JSObject* processObject)
+{
+    return JSC::constructEmptyObject(processObject->globalObject()); // PropertyCallback stub (process._preload_modules): an empty object
+}
 
-JSC_DEFINE_CUSTOM_GETTER(Process_getVersionsLazy,
-    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
-        JSC::PropertyName))
+static JSValue Process_stubEmptyArray(VM& vm, JSObject* processObject)
 {
-    JSC::VM& vm = globalObject->vm();
-    auto clientData = WebCore::clientData(vm);
+    return JSC::constructEmptyArray(processObject->globalObject(), nullptr); // PropertyCallback stub (process.moduleLoadList): an empty array
+}
 
-    Zig::Process* thisObject = JSC::jsDynamicCast<Zig::Process*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return JSValue::encode(JSC::jsUndefined());
+static JSValue Process_stubEmptySet(VM& vm, JSObject* processObject)
+{
+    // PropertyCallback stub (process.allowedNodeEnvironmentFlags): a fresh JSSet built from the global's Set structure.
+    return JSSet::create(vm, processObject->globalObject()->setStructure());
+}
+
+// PropertyCallback for process.memoryUsage (see processObjectTable): creates the native
+// memoryUsage function and attaches a second native function to it as its `rss` property.
+static JSValue constructMemoryUsage(VM& vm, JSObject* processObject)
+{
+    auto* owningGlobal = processObject->globalObject();
+    JSC::JSFunction* memoryUsageFn = JSC::JSFunction::create(vm, owningGlobal, 0,
+        String("memoryUsage"_s), Process_functionMemoryUsage, ImplementationVisibility::Public);
+    JSC::JSFunction* rssFn = JSC::JSFunction::create(vm, owningGlobal, 0,
+        String("rss"_s), Process_functionMemoryUsageRSS, ImplementationVisibility::Public);
+    memoryUsageFn->putDirect(vm, JSC::Identifier::fromString(vm, "rss"_s), rssFn, JSC::PropertyAttribute::Function | 0);
+    return memoryUsageFn;
+}
+
+// PropertyCallback for process.features (see processObjectTable): builds a fresh object whose
+// nine properties are all plain booleans.
+static JSValue constructFeatures(VM& vm, JSObject* processObject)
+{
+    // Reference shape:
+    // {
+    //     inspector: true, debug: false, uv: true, ipv6: true,
+    //     tls_alpn: true, tls_sni: true, tls_ocsp: true, tls: true,
+    //     cached_builtins: [Getter]
+    // }
+#ifdef BUN_DEBUG
+    constexpr bool isDebugBuild = true;
+#else
+    constexpr bool isDebugBuild = false;
+#endif
+
+    auto* features = constructEmptyObject(processObject->globalObject());
+    const auto setFlag = [&](ASCIILiteral key, bool enabled) {
+        features->putDirect(vm, Identifier::fromString(vm, key), jsBoolean(enabled));
+    };
+
+    // The order of these calls is the order in which the properties are added to the object.
+    setFlag("inspector"_s, true);
+    setFlag("debug"_s, isDebugBuild);
+    setFlag("uv"_s, true); // lying
+    setFlag("ipv6"_s, true);
+    setFlag("tls_alpn"_s, true);
+    setFlag("tls_sni"_s, true);
+    setFlag("tls_ocsp"_s, true);
+    setFlag("tls"_s, true);
+    setFlag("cached_builtins"_s, true);
+
+    return features;
+}
+
+static int _debugPort;
+// Getter for process.debugPort: a stored port of 0 is replaced by 9229 on first read, then returned as a number.
+JSC_DEFINE_CUSTOM_GETTER(processDebugPort, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    if (_debugPort == 0) {
+        _debugPort = 9229;
     }
-    auto scope = DECLARE_THROW_SCOPE(vm);
 
-    JSC::JSObject* object = JSC::constructEmptyObject(globalObject, globalObject->objectPrototype(), 19);
+    return JSC::JSValue::encode(jsNumber(_debugPort));
+}
 
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "node"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(REPORTED_NODE_VERSION))));
-    object->putDirect(
-        vm, JSC::Identifier::fromString(vm, "bun"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(Bun__version + 1 /* prefix with v */))));
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString(BUN_WEBKIT_VERSION))));
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "boringssl"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_boringssl))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "libarchive"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_libarchive))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "mimalloc"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_mimalloc))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "picohttpparser"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_picohttpparser))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "uwebsockets"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_uws))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "webkit"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_webkit))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "zig"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zig))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "zlib"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_zlib))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "tinycc"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_tinycc))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "lolhtml"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_lolhtml))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "ares"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_c_ares))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "usockets"_s),
-        JSC::JSValue(JSC::jsString(vm, makeString(Bun__versions_usockets))), 0);
+JSC_DEFINE_CUSTOM_SETTER(setProcessDebugPort, // setter for process.debugPort; stores the port in _debugPort
+    (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
+        JSC::EncodedJSValue encodedValue, JSC::PropertyName))
+{
+    auto& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+    JSValue value = JSValue::decode(encodedValue);
 
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "v8"_s), JSValue(JSC::jsString(vm, makeString("10.8.168.20-node.8"_s))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "uv"_s), JSValue(JSC::jsString(vm, makeString("1.44.2"_s))), 0);
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "napi"_s), JSValue(JSC::jsString(vm, makeString("8"_s))), 0);
+    if (!value.isInt32()) { // anything that is not an int32 gets the same RangeError as an out-of-range port
+        throwRangeError(globalObject, scope, "debugPort must be 0 or in range 1024 to 65535"_s);
+        return false;
+    }
 
-    object->putDirect(vm, JSC::Identifier::fromString(vm, "modules"_s),
-        JSC::JSValue(JSC::jsOwnedString(vm, makeAtomString("108"))));
+    int port = value.asInt32();
 
-    thisObject->putDirect(vm, clientData->builtinNames().versionsPublicName(), object, 0);
+    if (port != 0) { // 0 is stored as-is; the processDebugPort getter turns a stored 0 into 9229
+        if (port < 1024 || port > 65535) {
+            throwRangeError(globalObject, scope, "debugPort must be 0 or in range 1024 to 65535"_s);
+            return false;
+        }
+    }
 
-    RETURN_IF_EXCEPTION(scope, {});
+    _debugPort = port;
+    return true;
+}
 
-    return JSValue::encode(object);
+JSC_DEFINE_CUSTOM_GETTER(processTitle, (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue, JSC::PropertyName))
+{
+    ZigString str; // passed by address to Bun__Process__getTitle(), presumably as an out-parameter — confirm
+    Bun__Process__getTitle(globalObject, &str);
+    return JSValue::encode(Zig::toJSStringValue(str, globalObject)); // getter for process.title (see processObjectTable)
 }
-JSC_DEFINE_CUSTOM_SETTER(Process_setVersionsLazy,
+
+JSC_DEFINE_CUSTOM_SETTER(setProcessTitle, // setter for process.title; forwards the string to Bun__Process__setTitle()
     (JSC::JSGlobalObject * globalObject, JSC::EncodedJSValue thisValue,
         JSC::EncodedJSValue value, JSC::PropertyName))
 {
     JSC::VM& vm = globalObject->vm();
-    auto clientData = WebCore::clientData(vm);
 
-    Zig::Process* thisObject = JSC::jsDynamicCast<Zig::Process*>(JSValue::decode(thisValue));
-    if (!thisObject) {
-        return JSValue::encode(JSC::jsUndefined());
+    JSC::JSObject* thisObject = JSC::jsDynamicCast<JSC::JSObject*>(JSValue::decode(thisValue));
+    JSC::JSString* jsString = JSC::jsDynamicCast<JSC::JSString*>(JSValue::decode(value));
+    if (!thisObject || !jsString) { // a non-object receiver or a non-string value: report failure without throwing
+        return false;
     }
 
-    thisObject->putDirect(vm, clientData->builtinNames().versionsPublicName(),
-        JSC::JSValue::decode(value), 0);
+    ZigString str = Zig::toZigString(jsString, globalObject);
+    Bun__Process__setTitle(globalObject, &str);
 
     return true;
 }
 
-static JSC_DEFINE_HOST_FUNCTION(Process_functionCwd,
+JSC_DEFINE_HOST_FUNCTION(Process_functionCwd,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
@@ -958,4 +1606,162 @@ static JSC_DEFINE_HOST_FUNCTION(Process_functionCwd,
     return JSC::JSValue::encode(result);
 }
 
+// process._kill(pid, signal) (see processObjectTable): coerces both arguments with toInt32 and hands
+// them to kill() with no further validation; a kill() result of -1 is thrown as a SystemError.
+JSC_DEFINE_HOST_FUNCTION(Process_functionReallyKill,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+
+    const int targetPid = callFrame->argument(0).toInt32(globalObject);
+    RETURN_IF_EXCEPTION(scope, {});
+    const int signalNumber = callFrame->argument(1).toInt32(globalObject);
+    RETURN_IF_EXCEPTION(scope, {});
+
+    // Success path first; everything after this only runs when kill() reported -1.
+    if (kill(targetPid, signalNumber) != -1)
+        return JSValue::encode(jsUndefined());
+
+    SystemError error;
+    error.errno_ = errno;
+    error.syscall = Bun::toString("kill"_s);
+    throwException(globalObject, scope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+    return JSValue::encode(jsUndefined());
+}
+// process.kill(pid, signal), see processObjectTable. `signal` may be a number, a signal name, or
+// undefined/null (SIGTERM is used). A kill() result of -1 is thrown as a SystemError; otherwise returns undefined.
+JSC_DEFINE_HOST_FUNCTION(Process_functionKill,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
+
+    int pid = callFrame->argument(0).toInt32(globalObject);
+    RETURN_IF_EXCEPTION(scope, {});
+    if (pid < 0) {
+        throwRangeError(globalObject, scope, "pid must be a positive integer"_s);
+        return JSValue::encode(jsUndefined());
+    }
+
+    JSC::JSValue signalValue = callFrame->argument(1);
+    int signal = SIGTERM;
+
+    if (signalValue.isNumber()) {
+        signal = signalValue.toInt32(globalObject);
+        RETURN_IF_EXCEPTION(scope, {});
+    } else if (signalValue.isString()) {
+        // Convert and check for an exception first, so a failed conversion is never used as a map key.
+        auto signalName = signalValue.toWTFString(globalObject);
+        RETURN_IF_EXCEPTION(scope, {});
+        loadSignalNumberMap();
+        if (auto num = signalNameToNumberMap->get(signalName)) {
+            signal = num;
+        } else {
+            throwRangeError(globalObject, scope, "Unknown signal name"_s);
+            return JSValue::encode(jsUndefined());
+        }
+    } else if (!signalValue.isUndefinedOrNull()) {
+        throwTypeError(globalObject, scope, "signal must be a string or number"_s);
+        return JSValue::encode(jsUndefined());
+    }
+
+    int result = kill(pid, signal);
+    if (result == -1) {
+        SystemError error;
+        error.errno_ = errno;
+        error.syscall = Bun::toString("kill"_s);
+        throwException(globalObject, scope, JSValue::decode(SystemError__toErrorInstance(&error, globalObject)));
+        return JSValue::encode(jsUndefined());
+    }
+
+    return JSValue::encode(jsUndefined());
+}
+
+/* Source for Process.lut.h
+@begin processObjectTable
+  abort                            Process_functionAbort                    Function 1
+  allowedNodeEnvironmentFlags      Process_stubEmptySet                     PropertyCallback
+  arch                             constructArch                            PropertyCallback
+  argv                             constructArgv                            PropertyCallback
+  argv0                            constructArgv0                           PropertyCallback
+  assert                           Process_functionAssert                   Function 1
+  binding                          JSBuiltin                                Function 1
+  browser                          constructBrowser                         PropertyCallback
+  chdir                            Process_functionChdir                    Function 1
+  config                           constructProcessConfigObject             PropertyCallback
+  cpuUsage                         Process_functionCpuUsage                 Function 1
+  cwd                              Process_functionCwd                      Function 1
+  debugPort                        processDebugPort                         CustomAccessor
+  dlopen                           Process_functionDlopen                   Function 1
+  emitWarning                      Process_emitWarning                      Function 1
+  env                              constructEnv                             PropertyCallback
+  execArgv                         constructExecArgv                        PropertyCallback
+  execPath                         constructExecPath                        PropertyCallback
+  exit                             Process_functionExit                     Function 1
+  exitCode                         processExitCode                          CustomAccessor
+  features                         constructFeatures                        PropertyCallback
+  getActiveResourcesInfo           Process_stubFunctionReturningArray       Function 0
+  getegid                          Process_functiongetegid                  Function 0
+  geteuid                          Process_functiongeteuid                  Function 0
+  getgid                           Process_functiongetgid                   Function 0
+  getgroups                        Process_functiongetgroups                Function 0
+  getuid                           Process_functiongetuid                   Function 0
+  hrtime                           constructProcessHrtimeObject             PropertyCallback
+  isBun                            constructIsBun                           PropertyCallback
+  kill                             Process_functionKill                     Function 2
+  mainModule                       JSBuiltin                                ReadOnly|Builtin|Accessor|Function 0
+  memoryUsage                      constructMemoryUsage                     PropertyCallback
+  moduleLoadList                   Process_stubEmptyArray                   PropertyCallback
+  nextTick                         Process_functionNextTick                 Function 1
+  openStdin                        Process_functionOpenStdin                Function 0
+  pid                              constructPid                             PropertyCallback
+  platform                         constructPlatform                        PropertyCallback
+  ppid                             constructPpid                            PropertyCallback
+  reallyExit                       Process_functionReallyExit               Function 1
+  release                          constructProcessReleaseObject            PropertyCallback
+  revision                         constructRevision                        PropertyCallback
+  setSourceMapsEnabled             Process_stubEmptyFunction                Function 1
+  stderr                           constructStderr                          PropertyCallback
+  stdin                            constructStdin                           PropertyCallback
+  stdout                           constructStdout                          PropertyCallback
+  title                            processTitle                             CustomAccessor
+  umask                            Process_functionUmask                    Function 1
+  uptime                           Process_functionUptime                   Function 1
+  version                          constructVersion                         PropertyCallback
+  versions                         constructVersions                        PropertyCallback
+  _debugEnd                        Process_stubEmptyFunction                Function 0
+  _debugProcess                    Process_stubEmptyFunction                Function 0
+  _fatalException                  Process_stubEmptyFunction                Function 1
+  _getActiveRequests               Process_stubFunctionReturningArray       Function 0
+  _getActiveHandles                Process_stubFunctionReturningArray       Function 0
+  _linkedBinding                   Process_stubEmptyFunction                Function 0
+  _preload_modules                 Process_stubEmptyObject                  PropertyCallback
+  _rawDebug                        Process_stubEmptyFunction                Function 0
+  _startProfilerIdleNotifier       Process_stubEmptyFunction                Function 0
+  _stopProfilerIdleNotifier        Process_stubEmptyFunction                Function 0
+  _tickCallback                    Process_stubEmptyFunction                Function 0
+  _kill                            Process_functionReallyKill               Function 2
+@end
+*/
+
+#include "Process.lut.h"
+const JSC::ClassInfo Process::s_info = { "Process"_s, &Base::s_info, &processObjectTable, nullptr, // table: see the lut source comment above
+    CREATE_METHOD_TABLE(Process) };
+
+void Process::finishCreation(JSC::VM& vm)
+{
+    Base::finishCreation(vm);
+    // Install onDidChangeListeners as the wrapped object's listener-change callback.
+    this->wrapped().onDidChangeListener = &onDidChangeListeners;
+    // Register initializers only; the cpuUsage/memoryUsage host functions fetch these via getInitializedOnMainThread().
+    this->cpuUsageStructure.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::Structure>::Initializer& init) {
+        init.set(constructCPUUsageStructure(init.vm, init.owner->globalObject()));
+    });
+
+    this->memoryUsageStructure.initLater([](const JSC::LazyProperty<JSC::JSObject, JSC::Structure>::Initializer& init) {
+        init.set(constructMemoryUsageStructure(init.vm, init.owner->globalObject()));
+    });
+    // Own toStringTag-symbol property with the value "process" (attributes 0).
+    this->putDirect(vm, vm.propertyNames->toStringTagSymbol, jsString(vm, String("process"_s)), 0);
+}
+
 } // namespace Zig
diff --git a/src/bun.js/bindings/Process.h b/src/bun.js/bindings/Process.h
index 322b39078..0ee6f4243 100644
--- a/src/bun.js/bindings/Process.h
+++ b/src/bun.js/bindings/Process.h
@@ -19,6 +19,8 @@ public:
     {
     }
 
+    void emitSignalEvent(int signalNumber);
+
     DECLARE_EXPORT_INFO;
 
     static void destroy(JSC::JSCell* cell)
@@ -28,7 +30,7 @@ public:
 
     ~Process();
 
-    static constexpr unsigned StructureFlags = Base::StructureFlags;
+    static constexpr unsigned StructureFlags = Base::StructureFlags | HasStaticPropertyTable;
 
     static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject,
         JSC::JSValue prototype)
@@ -45,6 +47,24 @@ public:
         return accessor;
     }
 
+    LazyProperty<JSObject, Structure> cpuUsageStructure;
+    LazyProperty<JSObject, Structure> memoryUsageStructure;
+
+    DECLARE_VISIT_CHILDREN;
+
+    template<typename, SubspaceAccess mode>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+            return nullptr;
+        return WebCore::subspaceForImpl<Process, WebCore::UseCustomHeapCellType::No>(
+            vm,
+            [](auto& spaces) { return spaces.m_clientSubspaceForProcessObject.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForProcessObject = std::forward<decltype(space)>(space); },
+            [](auto& spaces) { return spaces.m_subspaceForProcessObject.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_subspaceForProcessObject = std::forward<decltype(space)>(space); });
+    }
+
     void finishCreation(JSC::VM& vm);
 };
 
diff --git a/src/bun.js/bindings/Process.lut.h b/src/bun.js/bindings/Process.lut.h
new file mode 100644
index 000000000..81cf98c7d
--- /dev/null
+++ b/src/bun.js/bindings/Process.lut.h
@@ -0,0 +1,214 @@
+// File generated via `make generate-builtins`
+static const struct CompactHashIndex processObjectTableIndex[143] = {
+    { 44, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 15, 129 },
+    { -1, -1 },
+    { -1, -1 },
+    { 18, 139 },
+    { -1, -1 },
+    { 46, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 3, 142 },
+    { 1, 128 },
+    { -1, -1 },
+    { 60, -1 },
+    { -1, -1 },
+    { 10, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 32, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 53, -1 },
+    { 27, -1 },
+    { 12, -1 },
+    { -1, -1 },
+    { 19, -1 },
+    { -1, -1 },
+    { 14, 138 },
+    { -1, -1 },
+    { 37, -1 },
+    { -1, -1 },
+    { 39, -1 },
+    { 56, -1 },
+    { 36, -1 },
+    { 6, 140 },
+    { -1, -1 },
+    { 52, -1 },
+    { 4, -1 },
+    { 48, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 2, -1 },
+    { 7, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 41, -1 },
+    { -1, -1 },
+    { 29, 133 },
+    { -1, -1 },
+    { 0, -1 },
+    { 26, 136 },
+    { 16, 130 },
+    { 40, -1 },
+    { -1, -1 },
+    { 23, -1 },
+    { 11, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 59, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 31, 137 },
+    { -1, -1 },
+    { 30, -1 },
+    { 22, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 24, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 20, -1 },
+    { -1, -1 },
+    { 5, -1 },
+    { -1, -1 },
+    { 61, -1 },
+    { 49, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 13, 131 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 9, -1 },
+    { 25, 134 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 21, 135 },
+    { -1, -1 },
+    { -1, -1 },
+    { -1, -1 },
+    { 47, 141 },
+    { -1, -1 },
+    { 17, -1 },
+    { 8, -1 },
+    { 28, -1 },
+    { 33, 132 },
+    { 34, -1 },
+    { 35, -1 },
+    { 38, -1 },
+    { 42, -1 },
+    { 43, -1 },
+    { 45, -1 },
+    { 50, -1 },
+    { 51, -1 },
+    { 54, -1 },
+    { 55, -1 },
+    { 57, -1 },
+    { 58, -1 },
+};
+
+static const struct HashTableValue processObjectTableValues[62] = { // one row per property: name, attributes, intrinsic, value descriptor
+   { "abort"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionAbort, 1 } },
+   { "allowedNodeEnvironmentFlags"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, Process_stubEmptySet } },
+   { "arch"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructArch } },
+   { "argv"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructArgv } },
+   { "argv0"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructArgv0 } },
+   { "assert"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionAssert, 1 } },
+   { "binding"_s, ((static_cast<unsigned>(PropertyAttribute::Function)) & ~PropertyAttribute::Function) | PropertyAttribute::Builtin, NoIntrinsic, { HashTableValue::BuiltinGeneratorType, processObjectBindingCodeGenerator, 1 } },
+   { "browser"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructBrowser } },
+   { "chdir"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionChdir, 1 } },
+   { "config"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructProcessConfigObject } },
+   { "cpuUsage"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionCpuUsage, 1 } },
+   { "cwd"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionCwd, 1 } },
+   { "debugPort"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, processDebugPort, setProcessDebugPort } },
+   { "dlopen"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionDlopen, 1 } },
+   { "emitWarning"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_emitWarning, 1 } },
+   { "env"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructEnv } },
+   { "execArgv"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructExecArgv } },
+   { "execPath"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructExecPath } },
+   { "exit"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionExit, 1 } },
+   { "exitCode"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, processExitCode, setProcessExitCode } },
+   { "features"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructFeatures } },
+   { "getActiveResourcesInfo"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubFunctionReturningArray, 0 } },
+   { "getegid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetegid, 0 } },
+   { "geteuid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongeteuid, 0 } },
+   { "getgid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetgid, 0 } },
+   { "getgroups"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetgroups, 0 } },
+   { "getuid"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functiongetuid, 0 } },
+   { "hrtime"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructProcessHrtimeObject } },
+   { "isBun"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructIsBun } },
+   { "kill"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionKill, 2 } },
+   { "mainModule"_s, ((static_cast<unsigned>(PropertyAttribute::ReadOnly|PropertyAttribute::Builtin|PropertyAttribute::Accessor|PropertyAttribute::Function)) & ~PropertyAttribute::Function) | PropertyAttribute::Builtin, NoIntrinsic, { HashTableValue::BuiltinGeneratorType, processObjectMainModuleCodeGenerator, 0 } },
+   { "memoryUsage"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructMemoryUsage } },
+   { "moduleLoadList"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, Process_stubEmptyArray } },
+   { "nextTick"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionNextTick, 1 } },
+   { "openStdin"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionOpenStdin, 0 } },
+   { "pid"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructPid } },
+   { "platform"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructPlatform } },
+   { "ppid"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructPpid } },
+   { "reallyExit"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionReallyExit, 1 } },
+   { "release"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructProcessReleaseObject } },
+   { "revision"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructRevision } },
+   { "setSourceMapsEnabled"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 1 } },
+   { "stderr"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructStderr } },
+   { "stdin"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructStdin } },
+   { "stdout"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructStdout } },
+   { "title"_s, static_cast<unsigned>(PropertyAttribute::CustomAccessor), NoIntrinsic, { HashTableValue::GetterSetterType, processTitle, setProcessTitle } },
+   { "umask"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionUmask, 1 } },
+   { "uptime"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionUptime, 1 } },
+   { "version"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructVersion } },
+   { "versions"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, constructVersions } },
+   { "_debugEnd"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_debugProcess"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_fatalException"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 1 } },
+   { "_getActiveRequests"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubFunctionReturningArray, 0 } },
+   { "_getActiveHandles"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubFunctionReturningArray, 0 } },
+   { "_linkedBinding"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_preload_modules"_s, static_cast<unsigned>(PropertyAttribute::PropertyCallback), NoIntrinsic, { HashTableValue::LazyPropertyType, Process_stubEmptyObject } },
+   { "_rawDebug"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_startProfilerIdleNotifier"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_stopProfilerIdleNotifier"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_tickCallback"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_stubEmptyFunction, 0 } },
+   { "_kill"_s, static_cast<unsigned>(PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, Process_functionReallyKill, 2 } },
+};
+
+static const struct HashTable processObjectTable = // binds the 62 value rows to their index table
+    { 62, 127, true, nullptr, processObjectTableValues, processObjectTableIndex }; // NOTE(review): 127 is presumably the index mask; confirm against JSC::HashTable
diff --git a/src/bun.js/bindings/ScriptExecutionContext.cpp b/src/bun.js/bindings/ScriptExecutionContext.cpp
index e8cae5e33..3262bdb5d 100644
--- a/src/bun.js/bindings/ScriptExecutionContext.cpp
+++ b/src/bun.js/bindings/ScriptExecutionContext.cpp
@@ -20,6 +20,12 @@ static HashMap<ScriptExecutionContextIdentifier, ScriptExecutionContext*>& allSc
     return contexts;
 }
 
+ScriptExecutionContext* ScriptExecutionContext::getScriptExecutionContext(ScriptExecutionContextIdentifier identifier) // looks a context up by its identifier
+{
+    Locker locker { allScriptExecutionContextsMapLock }; // the map is only read while this lock is held
+    return allScriptExecutionContextsMap().get(identifier); // NOTE(review): the pointer outlives the lock scope; confirm callers tolerate that
+}
+
 template<bool SSL, bool isServer>
 static void registerHTTPContextForWebSocket(ScriptExecutionContext* script, us_socket_context_t* ctx, us_loop_t* loop)
 {
diff --git a/src/bun.js/bindings/ScriptExecutionContext.h b/src/bun.js/bindings/ScriptExecutionContext.h
index 5f6c56a90..aed7977a5 100644
--- a/src/bun.js/bindings/ScriptExecutionContext.h
+++ b/src/bun.js/bindings/ScriptExecutionContext.h
@@ -96,7 +96,12 @@ public:
         }
     }
 
-    const WTF::URL& url() const { return m_url; }
+    static ScriptExecutionContext* getScriptExecutionContext(ScriptExecutionContextIdentifier identifier);
+
+    const WTF::URL& url() const
+    {
+        return m_url; // hands out a reference to the member rather than a copy
+    }
     bool activeDOMObjectsAreSuspended() { return false; }
     bool activeDOMObjectsAreStopped() { return false; }
     bool isContextThread() { return true; }
@@ -141,6 +146,7 @@ public:
         auto* task = new EventLoopTask(WTFMove(lambda));
         postTaskOnTimeout(task, timeout);
     }
+
     template<typename... Arguments>
     void postCrossThreadTask(Arguments&&... arguments)
     {
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h b/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h
index b16febcdb..f0d491c0b 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+DOMClientIsoSubspaces.h
@@ -8,6 +8,7 @@ std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectConstructor;std:
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectAnything;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectStringContaining;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForExpectStringMatching;
+std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForFSWatcher;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForFileSystemRouter;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForFileSystemRouterConstructor;std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForListener;
 std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForMD4;
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h b/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h
index 59263e62c..02a9adbca 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+DOMIsoSubspaces.h
@@ -8,6 +8,7 @@ std::unique_ptr<IsoSubspace> m_subspaceForExpectConstructor;std::unique_ptr<IsoS
 std::unique_ptr<IsoSubspace> m_subspaceForExpectAnything;
 std::unique_ptr<IsoSubspace> m_subspaceForExpectStringContaining;
 std::unique_ptr<IsoSubspace> m_subspaceForExpectStringMatching;
+std::unique_ptr<IsoSubspace> m_subspaceForFSWatcher;
 std::unique_ptr<IsoSubspace> m_subspaceForFileSystemRouter;
 std::unique_ptr<IsoSubspace> m_subspaceForFileSystemRouterConstructor;std::unique_ptr<IsoSubspace> m_subspaceForListener;
 std::unique_ptr<IsoSubspace> m_subspaceForMD4;
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h
index 4471fbab3..ac03032e6 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureHeader.h
@@ -58,6 +58,12 @@ JSC::Structure* JSExpectStringMatchingStructure() { return m_JSExpectStringMatch
   JSC::LazyClassStructure m_JSExpectStringMatching;
   bool hasJSExpectStringMatchingSetterValue { false };
   mutable JSC::WriteBarrier<JSC::Unknown> m_JSExpectStringMatchingSetterValue;
+JSC::Structure* JSFSWatcherStructure() { return m_JSFSWatcher.getInitializedOnMainThread(this); } // structure passed to JSFSWatcher::create by FSWatcher__create
+        JSC::JSObject* JSFSWatcherConstructor() { return m_JSFSWatcher.constructorInitializedOnMainThread(this); }
+        JSC::JSValue JSFSWatcherPrototype() { return m_JSFSWatcher.prototypeInitializedOnMainThread(this); }
+  JSC::LazyClassStructure m_JSFSWatcher; // configured through initLater() in initGeneratedLazyClasses()
+  bool hasJSFSWatcherSetterValue { false };
+  mutable JSC::WriteBarrier<JSC::Unknown> m_JSFSWatcherSetterValue; // appended to the visitor in visitGeneratedLazyClasses()
 JSC::Structure* JSFileSystemRouterStructure() { return m_JSFileSystemRouter.getInitializedOnMainThread(this); }
         JSC::JSObject* JSFileSystemRouterConstructor() { return m_JSFileSystemRouter.constructorInitializedOnMainThread(this); }
         JSC::JSValue JSFileSystemRouterPrototype() { return m_JSFileSystemRouter.prototypeInitializedOnMainThread(this); }
diff --git a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h
index 4e5a2c1fa..b3b5327a4 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses+lazyStructureImpl.h
@@ -59,6 +59,12 @@ void GlobalObject::initGeneratedLazyClasses() {
                  init.setStructure(WebCore::JSExpectStringMatching::createStructure(init.vm, init.global, init.prototype));
                  
               });
+    m_JSFSWatcher.initLater( // registers the callback that builds the JSFSWatcher prototype and structure
+              [](LazyClassStructure::Initializer& init) {
+                 init.setPrototype(WebCore::JSFSWatcher::createPrototype(init.vm, reinterpret_cast<Zig::GlobalObject*>(init.global)));
+                 init.setStructure(WebCore::JSFSWatcher::createStructure(init.vm, init.global, init.prototype));
+                 // Only a prototype and a structure are set here; no constructor is registered.
+              });
     m_JSFileSystemRouter.initLater(
               [](LazyClassStructure::Initializer& init) {
                  init.setPrototype(WebCore::JSFileSystemRouter::createPrototype(init.vm, reinterpret_cast<Zig::GlobalObject*>(init.global)));
@@ -211,6 +217,7 @@ void GlobalObject::visitGeneratedLazyClasses(GlobalObject *thisObject, Visitor&
       thisObject->m_JSExpectAnything.visit(visitor);  visitor.append(thisObject->m_JSExpectAnythingSetterValue);
       thisObject->m_JSExpectStringContaining.visit(visitor);  visitor.append(thisObject->m_JSExpectStringContainingSetterValue);
       thisObject->m_JSExpectStringMatching.visit(visitor);  visitor.append(thisObject->m_JSExpectStringMatchingSetterValue);
+      thisObject->m_JSFSWatcher.visit(visitor);  visitor.append(thisObject->m_JSFSWatcherSetterValue);
       thisObject->m_JSFileSystemRouter.visit(visitor);  visitor.append(thisObject->m_JSFileSystemRouterSetterValue);
       thisObject->m_JSListener.visit(visitor);  visitor.append(thisObject->m_JSListenerSetterValue);
       thisObject->m_JSMD4.visit(visitor);  visitor.append(thisObject->m_JSMD4SetterValue);
diff --git a/src/bun.js/bindings/ZigGeneratedClasses.cpp b/src/bun.js/bindings/ZigGeneratedClasses.cpp
index d51a1959a..b4d672328 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses.cpp
+++ b/src/bun.js/bindings/ZigGeneratedClasses.cpp
@@ -103,6 +103,9 @@ extern "C" void BlobClass__finalize(void*);
 extern "C" EncodedJSValue BlobPrototype__getArrayBuffer(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(BlobPrototype__arrayBufferCallback);
 
+extern "C" EncodedJSValue BlobPrototype__getExists(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(BlobPrototype__existsCallback);
+
 extern "C" EncodedJSValue BlobPrototype__getFormData(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(BlobPrototype__formDataCallback);
 
@@ -137,6 +140,7 @@ STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSBlobPrototype, JSBlobPrototype::Base);
 
 static const HashTableValue JSBlobPrototypeTableValues[] = {
     { "arrayBuffer"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__arrayBufferCallback, 0 } },
+    { "exists"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__existsCallback, 0 } },
     { "formData"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__formDataCallback, 0 } },
     { "json"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, BlobPrototype__jsonCallback, 0 } },
     { "lastModified"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, BlobPrototype__lastModifiedGetterWrap, 0 } },
@@ -190,6 +194,33 @@ JSC_DEFINE_HOST_FUNCTION(BlobPrototype__arrayBufferCallback, (JSGlobalObject * l
     return BlobPrototype__getArrayBuffer(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+// Host function for the Blob prototype's `exists`: throws a TypeError unless `this` is a JSBlob, then forwards to BlobPrototype__getExists.
+JSC_DEFINE_HOST_FUNCTION(BlobPrototype__existsCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSBlob* thisObject = jsDynamicCast<JSBlob*>(callFrame->thisValue());
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    // Debugger aid: exposes the file name of the JS caller. The CString is kept in a local
+    // so `fileName` stays readable for the rest of this call instead of dangling at once.
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const auto fileNameUTF8 = sourceOrigin.string().utf8();
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName)
+        lastFileName = fileName; // only ever compared, never dereferenced
+#endif
+
+    return BlobPrototype__getExists(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(BlobPrototype__formDataCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -5381,6 +5412,307 @@ void JSExpectStringMatching::visitOutputConstraintsImpl(JSCell* cell, Visitor& v
 }
 
 DEFINE_VISIT_OUTPUT_CONSTRAINTS(JSExpectStringMatching);
+// Prototype object for JSFSWatcher; finishCreation() populates it from the static property table.
+class JSFSWatcherPrototype final : public JSC::JSNonFinalObject {
+public:
+    using Base = JSC::JSNonFinalObject;
+    DECLARE_INFO;
+
+    // Constructs the prototype in a newly allocated GC cell, then finishes its creation.
+    static JSFSWatcherPrototype* create(JSC::VM& vm, JSGlobalObject* globalObject, JSC::Structure* structure)
+    {
+        auto* prototype = new (NotNull, JSC::allocateCell<JSFSWatcherPrototype>(vm)) JSFSWatcherPrototype(vm, globalObject, structure);
+        prototype->finishCreation(vm, globalObject);
+        return prototype;
+    }
+
+    // Cells of this class are placed in the VM's plain-object space.
+    template<typename CellType, JSC::SubspaceAccess>
+    static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm) { return &vm.plainObjectSpace(); }
+
+    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+    {
+        const JSC::TypeInfo typeInfo(JSC::ObjectType, StructureFlags);
+        return JSC::Structure::create(vm, globalObject, prototype, typeInfo, info());
+    }
+
+private:
+    JSFSWatcherPrototype(JSC::VM& vm, JSC::JSGlobalObject*, JSC::Structure* structure)
+        : Base(vm, structure) {}
+
+    void finishCreation(JSC::VM&, JSC::JSGlobalObject*);
+};
+
+extern "C" void FSWatcherClass__finalize(void*);
+
+extern "C" EncodedJSValue FSWatcherPrototype__doClose(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__closeCallback);
+
+extern "C" EncodedJSValue FSWatcherPrototype__hasRef(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__hasRefCallback);
+
+extern "C" EncodedJSValue FSWatcherPrototype__doRef(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__refCallback);
+
+extern "C" EncodedJSValue FSWatcherPrototype__doUnref(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(FSWatcherPrototype__unrefCallback);
+
+STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSFSWatcherPrototype, JSFSWatcherPrototype::Base);
+
+static const HashTableValue JSFSWatcherPrototypeTableValues[] = { // methods reified onto the prototype in JSFSWatcherPrototype::finishCreation
+    { "close"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__closeCallback, 0 } },
+    { "hasRef"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__hasRefCallback, 0 } },
+    { "ref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__refCallback, 0 } },
+    { "unref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, FSWatcherPrototype__unrefCallback, 0 } }
+};
+
+const ClassInfo JSFSWatcherPrototype::s_info = { "FSWatcher"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSFSWatcherPrototype) }; // same class name string as JSFSWatcher::s_info
+
+// Host function for the FSWatcher prototype's `close`: throws a TypeError unless `this` is a JSFSWatcher, then forwards to FSWatcherPrototype__doClose.
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__closeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    // Debugger aid: exposes the file name of the JS caller. The CString is kept in a local
+    // so `fileName` stays readable for the rest of this call instead of dangling at once.
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const auto fileNameUTF8 = sourceOrigin.string().utf8();
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName)
+        lastFileName = fileName; // only ever compared, never dereferenced
+#endif
+
+    return FSWatcherPrototype__doClose(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+// Host function for the FSWatcher prototype's `hasRef`: throws a TypeError unless `this` is a JSFSWatcher, then forwards to FSWatcherPrototype__hasRef.
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__hasRefCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    // Debugger aid: exposes the file name of the JS caller. The CString is kept in a local
+    // so `fileName` stays readable for the rest of this call instead of dangling at once.
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const auto fileNameUTF8 = sourceOrigin.string().utf8();
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName)
+        lastFileName = fileName; // only ever compared, never dereferenced
+#endif
+
+    return FSWatcherPrototype__hasRef(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+// Host function for the FSWatcher prototype's `ref`: throws a TypeError unless `this` is a JSFSWatcher, then forwards to FSWatcherPrototype__doRef.
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__refCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    // Debugger aid: exposes the file name of the JS caller. The CString is kept in a local
+    // so `fileName` stays readable for the rest of this call instead of dangling at once.
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const auto fileNameUTF8 = sourceOrigin.string().utf8();
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName)
+        lastFileName = fileName; // only ever compared, never dereferenced
+#endif
+
+    return FSWatcherPrototype__doRef(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+// Host function for the FSWatcher prototype's `unref`: throws a TypeError unless `this` is a JSFSWatcher, then forwards to FSWatcherPrototype__doUnref.
+JSC_DEFINE_HOST_FUNCTION(FSWatcherPrototype__unrefCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+
+    JSFSWatcher* thisObject = jsDynamicCast<JSFSWatcher*>(callFrame->thisValue());
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    // Debugger aid: exposes the file name of the JS caller. The CString is kept in a local
+    // so `fileName` stays readable for the rest of this call instead of dangling at once.
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const auto fileNameUTF8 = sourceOrigin.string().utf8();
+    const char* fileName = fileNameUTF8.data();
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName)
+        lastFileName = fileName; // only ever compared, never dereferenced
+#endif
+
+    return FSWatcherPrototype__doUnref(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
+// Stores `value` in the m_listener slot of the JSFSWatcher encoded in `thisValue`.
+extern "C" void FSWatcherPrototype__listenerSetCachedValue(JSC::EncodedJSValue thisValue, JSC::JSGlobalObject* globalObject, JSC::EncodedJSValue value)
+{
+    auto* watcher = jsCast<JSFSWatcher*>(JSValue::decode(thisValue));
+    watcher->m_listener.set(globalObject->vm(), watcher, JSValue::decode(value));
+}
+
+// Reads back whatever is held in the m_listener slot of the JSFSWatcher encoded in `thisValue`.
+extern "C" EncodedJSValue FSWatcherPrototype__listenerGetCachedValue(JSC::EncodedJSValue thisValue)
+{
+    return JSValue::encode(jsCast<JSFSWatcher*>(JSValue::decode(thisValue))->m_listener.get());
+}
+
+void JSFSWatcherPrototype::finishCreation(JSC::VM& vm, JSC::JSGlobalObject* globalObject) // second construction phase, called from create()
+{
+    Base::finishCreation(vm);
+    reifyStaticProperties(vm, JSFSWatcher::info(), JSFSWatcherPrototypeTableValues, *this); // installs close / hasRef / ref / unref
+    JSC_TO_STRING_TAG_WITHOUT_TRANSITION();
+}
+
+extern "C" bool FSWatcher__hasPendingActivity(void* ptr); // declared here; the definition is not in this file
+bool JSFSWatcher::hasPendingActivity(void* ctx)
+{
+    return FSWatcher__hasPendingActivity(ctx); // pure forwarder: the answer comes from the extern "C" implementation
+}
+
+// Passes the wrapped context pointer, when one is set, to FSWatcherClass__finalize.
+JSFSWatcher::~JSFSWatcher()
+{
+    if (m_ctx)
+        FSWatcherClass__finalize(m_ctx);
+}
+void JSFSWatcher::destroy(JSCell* cell)
+{
+    static_cast<JSFSWatcher*>(cell)->JSFSWatcher::~JSFSWatcher(); // explicit, qualified destructor call; nothing is deallocated here
+}
+
+const ClassInfo JSFSWatcher::s_info = { "FSWatcher"_s, &Base::s_info, nullptr, nullptr, CREATE_METHOD_TABLE(JSFSWatcher) }; // class info that jsDynamicCast/jsCast<JSFSWatcher*> checks resolve against
+
+void JSFSWatcher::finishCreation(VM& vm)
+{
+    Base::finishCreation(vm);
+    ASSERT(inherits(info())); // debug check that this cell's class info matches or derives from JSFSWatcher's
+}
+
+JSFSWatcher* JSFSWatcher::create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, void* ctx)
+{
+    auto* watcher = new (NotNull, JSC::allocateCell<JSFSWatcher>(vm)) JSFSWatcher(vm, structure, ctx); // placement-new into a freshly allocated cell
+    watcher->finishCreation(vm);
+    return watcher; // `globalObject` is accepted but not used by this factory
+}
+
+// Returns the native pointer wrapped by `value`, or nullptr when `value` is empty,
+// not a cell, or not a JSFSWatcher.
+extern "C" void* FSWatcher__fromJS(JSC::EncodedJSValue value)
+{
+    const JSC::JSValue decoded = JSC::JSValue::decode(value);
+    const bool holdsCell = !decoded.isEmpty() && decoded.isCell();
+    if (!holdsCell)
+        return nullptr;
+
+    if (auto* watcher = JSC::jsDynamicCast<JSFSWatcher*>(decoded.asCell()))
+        return watcher->wrapped();
+
+    return nullptr;
+}
+
+// Overwrites the context pointer stored in a JSFSWatcher; returns false when `value` is not one.
+// The previously stored pointer is neither finalized nor returned here.
+extern "C" bool FSWatcher__dangerouslySetPtr(JSC::EncodedJSValue value, void* ptr)
+{
+    auto* watcher = JSC::jsDynamicCast<JSFSWatcher*>(JSValue::decode(value));
+    if (watcher)
+        watcher->m_ctx = ptr;
+    return watcher != nullptr;
+}
+
+extern "C" const size_t FSWatcher__ptrOffset = JSFSWatcher::offsetOfWrapped(); // exported with C linkage; presumably the byte offset of the wrapped pointer, confirm against offsetOfWrapped()
+
+void JSFSWatcher::analyzeHeap(JSCell* cell, HeapAnalyzer& analyzer)
+{
+    auto* thisObject = jsCast<JSFSWatcher*>(cell);
+    if (void* wrapped = thisObject->wrapped()) { // NOTE(review): `wrapped` is unused; with its body commented out this branch does nothing
+        // if (thisObject->scriptExecutionContext())
+        //     analyzer.setLabelForCell(cell, "url " + thisObject->scriptExecutionContext()->url().string());
+    }
+    Base::analyzeHeap(cell, analyzer); // all actual reporting is delegated to the base class
+}
+
+JSObject* JSFSWatcher::createPrototype(VM& vm, JSDOMGlobalObject* globalObject)
+{
+    return JSFSWatcherPrototype::create(vm, globalObject, JSFSWatcherPrototype::createStructure(vm, globalObject, globalObject->objectPrototype())); // the new prototype's own prototype is the global's objectPrototype()
+}
+
+// Wraps the native pointer `ptr` in a new JSFSWatcher built with the global object's FSWatcher structure.
+extern "C" EncodedJSValue FSWatcher__create(Zig::GlobalObject* globalObject, void* ptr)
+{
+    JSC::VM& vm = globalObject->vm();
+    auto* watcherStructure = globalObject->JSFSWatcherStructure();
+    auto* watcher = JSFSWatcher::create(vm, globalObject, watcherStructure, ptr);
+    return JSValue::encode(watcher);
+}
+
+template<typename Visitor>
+void JSFSWatcher::visitChildrenImpl(JSCell* cell, Visitor& visitor) // visit hook: base children, then this wrapper's own references
+{
+    JSFSWatcher* thisObject = jsCast<JSFSWatcher*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    Base::visitChildren(thisObject, visitor);
+    visitor.append(thisObject->m_listener); // report the cached listener value
+    // The wrapped native pointer is registered with the visitor as an opaque root.
+    visitor.addOpaqueRoot(thisObject->wrapped());
+}
+
+DEFINE_VISIT_CHILDREN(JSFSWatcher);
+
+template<typename Visitor>
+void JSFSWatcher::visitAdditionalChildren(Visitor& visitor) // same references as visitChildrenImpl, minus the base-class visit
+{
+    JSFSWatcher* thisObject = this;
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    visitor.append(thisObject->m_listener); // report the cached listener value
+    // The wrapped native pointer is registered with the visitor as an opaque root.
+    visitor.addOpaqueRoot(this->wrapped());
+}
+
+DEFINE_VISIT_ADDITIONAL_CHILDREN(JSFSWatcher);
+
+template<typename Visitor>
+void JSFSWatcher::visitOutputConstraintsImpl(JSCell* cell, Visitor& visitor) // static entry point that forwards to the member visitor
+{
+    JSFSWatcher* thisObject = jsCast<JSFSWatcher*>(cell);
+    ASSERT_GC_OBJECT_INHERITS(thisObject, info());
+    thisObject->visitAdditionalChildren<Visitor>(visitor); // appends m_listener and adds the wrapped pointer as an opaque root
+}
+
+DEFINE_VISIT_OUTPUT_CONSTRAINTS(JSFSWatcher);
 class JSFileSystemRouterPrototype final : public JSC::JSNonFinalObject {
 public:
     using Base = JSC::JSNonFinalObject;
@@ -7654,6 +7986,9 @@ JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__utimesCallback);
 extern "C" EncodedJSValue NodeJSFSPrototype__utimesSync(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__utimesSyncCallback);
 
+extern "C" EncodedJSValue NodeJSFSPrototype__watch(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__watchCallback);
+
 extern "C" EncodedJSValue NodeJSFSPrototype__write(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(NodeJSFSPrototype__writeCallback);
 
@@ -7751,6 +8086,7 @@ static const HashTableValue JSNodeJSFSPrototypeTableValues[] = {
     { "unlinkSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__unlinkSyncCallback, 1 } },
     { "utimes"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__utimesCallback, 4 } },
     { "utimesSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__utimesSyncCallback, 3 } },
+    { "watch"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__watchCallback, 3 } },
     { "write"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__writeCallback, 6 } },
     { "writeFile"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__writeFileCallback, 4 } },
     { "writeFileSync"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, NodeJSFSPrototype__writeFileSyncCallback, 3 } },
@@ -9795,6 +10131,33 @@ JSC_DEFINE_HOST_FUNCTION(NodeJSFSPrototype__utimesSyncCallback, (JSGlobalObject
     return NodeJSFSPrototype__utimesSync(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(NodeJSFSPrototype__watchCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    // Host function behind the "watch" prototype entry: check the receiver, then forward to the extern "C" NodeJSFSPrototype__watch.
+    JSNodeJSFS* thisObject = jsDynamicCast<JSNodeJSFS*>(callFrame->thisValue());
+    // A receiver that is not a JSNodeJSFS produces a TypeError instead of reaching native code.
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data(); // NOTE(review): points into the temporary returned by utf8(); confirm it is only compared, never dereferenced later
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return NodeJSFSPrototype__watch(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(NodeJSFSPrototype__writeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -16509,6 +16872,9 @@ extern "C" void* TCPSocketClass__construct(JSC::JSGlobalObject*, JSC::CallFrame*
 JSC_DECLARE_CUSTOM_GETTER(jsTCPSocketConstructor);
 extern "C" void TCPSocketClass__finalize(void*);
 
+extern "C" JSC::EncodedJSValue TCPSocketPrototype__getALPNProtocol(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
+JSC_DECLARE_CUSTOM_GETTER(TCPSocketPrototype__alpnProtocolGetterWrap);
+
 extern "C" JSC::EncodedJSValue TCPSocketPrototype__getAuthorized(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TCPSocketPrototype__authorizedGetterWrap);
 
@@ -16545,6 +16911,9 @@ JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__reloadCallback);
 extern "C" JSC::EncodedJSValue TCPSocketPrototype__getRemoteAddress(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TCPSocketPrototype__remoteAddressGetterWrap);
 
+extern "C" EncodedJSValue TCPSocketPrototype__setServername(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__setServernameCallback);
+
 extern "C" EncodedJSValue TCPSocketPrototype__shutdown(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__shutdownCallback);
 
@@ -16554,12 +16923,16 @@ JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__timeoutCallback);
 extern "C" EncodedJSValue TCPSocketPrototype__unref(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__unrefCallback);
 
+extern "C" EncodedJSValue TCPSocketPrototype__upgradeTLS(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__upgradeTLSCallback);
+
 extern "C" EncodedJSValue TCPSocketPrototype__write(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TCPSocketPrototype__writeCallback);
 
 STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTCPSocketPrototype, JSTCPSocketPrototype::Base);
 
 static const HashTableValue JSTCPSocketPrototypeTableValues[] = {
+    { "alpnProtocol"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__alpnProtocolGetterWrap, 0 } },
     { "authorized"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__authorizedGetterWrap, 0 } },
     { "data"_s, static_cast<unsigned>(JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__dataGetterWrap, TCPSocketPrototype__dataSetterWrap } },
     { "end"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__endCallback, 3 } },
@@ -16571,9 +16944,11 @@ static const HashTableValue JSTCPSocketPrototypeTableValues[] = {
     { "ref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__refCallback, 0 } },
     { "reload"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__reloadCallback, 1 } },
     { "remoteAddress"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TCPSocketPrototype__remoteAddressGetterWrap, 0 } },
+    { "setServername"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__setServernameCallback, 1 } },
     { "shutdown"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__shutdownCallback, 1 } },
     { "timeout"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__timeoutCallback, 1 } },
     { "unref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__unrefCallback, 0 } },
+    { "upgradeTLS"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__upgradeTLSCallback, 1 } },
     { "write"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TCPSocketPrototype__writeCallback, 3 } }
 };
 
@@ -16591,6 +16966,18 @@ JSC_DEFINE_CUSTOM_GETTER(jsTCPSocketConstructor, (JSGlobalObject * lexicalGlobal
     return JSValue::encode(globalObject->JSTCPSocketConstructor());
 }
 
+JSC_DEFINE_CUSTOM_GETTER(TCPSocketPrototype__alpnProtocolGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    JSTCPSocket* thisObject = jsCast<JSTCPSocket*>(JSValue::decode(thisValue)); // NOTE(review): no failure path here, unlike the jsDynamicCast + null check in the method callbacks
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+    JSC::EncodedJSValue result = TCPSocketPrototype__getALPNProtocol(thisObject->wrapped(), globalObject); // extern "C" getter behind the "alpnProtocol" property
+    RETURN_IF_EXCEPTION(throwScope, {}); // an exception pending after the native call wins over `result`
+    RELEASE_AND_RETURN(throwScope, result);
+}
+
 JSC_DEFINE_CUSTOM_GETTER(TCPSocketPrototype__authorizedGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -16847,6 +17234,33 @@ extern "C" EncodedJSValue TCPSocketPrototype__remoteAddressGetCachedValue(JSC::E
     return JSValue::encode(thisObject->m_remoteAddress.get());
 }
 
+JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__setServernameCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    // Host function behind the "setServername" prototype entry: check the receiver, then forward to the extern "C" TCPSocketPrototype__setServername.
+    JSTCPSocket* thisObject = jsDynamicCast<JSTCPSocket*>(callFrame->thisValue());
+    // A receiver that is not a JSTCPSocket produces a TypeError instead of reaching native code.
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data(); // NOTE(review): points into the temporary returned by utf8(); confirm it is only compared, never dereferenced later
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TCPSocketPrototype__setServername(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__shutdownCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -16928,6 +17342,33 @@ JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__unrefCallback, (JSGlobalObject * le
     return TCPSocketPrototype__unref(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__upgradeTLSCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    // Host function behind the "upgradeTLS" prototype entry: check the receiver, then forward to the extern "C" TCPSocketPrototype__upgradeTLS.
+    JSTCPSocket* thisObject = jsDynamicCast<JSTCPSocket*>(callFrame->thisValue());
+    // A receiver that is not a JSTCPSocket produces a TypeError instead of reaching native code.
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data(); // NOTE(review): points into the temporary returned by utf8(); confirm it is only compared, never dereferenced later
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TCPSocketPrototype__upgradeTLS(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TCPSocketPrototype__writeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -17116,6 +17557,9 @@ extern "C" void* TLSSocketClass__construct(JSC::JSGlobalObject*, JSC::CallFrame*
 JSC_DECLARE_CUSTOM_GETTER(jsTLSSocketConstructor);
 extern "C" void TLSSocketClass__finalize(void*);
 
+extern "C" JSC::EncodedJSValue TLSSocketPrototype__getALPNProtocol(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
+JSC_DECLARE_CUSTOM_GETTER(TLSSocketPrototype__alpnProtocolGetterWrap);
+
 extern "C" JSC::EncodedJSValue TLSSocketPrototype__getAuthorized(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TLSSocketPrototype__authorizedGetterWrap);
 
@@ -17152,6 +17596,9 @@ JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__reloadCallback);
 extern "C" JSC::EncodedJSValue TLSSocketPrototype__getRemoteAddress(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject);
 JSC_DECLARE_CUSTOM_GETTER(TLSSocketPrototype__remoteAddressGetterWrap);
 
+extern "C" EncodedJSValue TLSSocketPrototype__setServername(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__setServernameCallback);
+
 extern "C" EncodedJSValue TLSSocketPrototype__shutdown(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__shutdownCallback);
 
@@ -17161,12 +17608,16 @@ JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__timeoutCallback);
 extern "C" EncodedJSValue TLSSocketPrototype__unref(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__unrefCallback);
 
+extern "C" EncodedJSValue TLSSocketPrototype__upgradeTLS(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
+JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__upgradeTLSCallback);
+
 extern "C" EncodedJSValue TLSSocketPrototype__write(void* ptr, JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame);
 JSC_DECLARE_HOST_FUNCTION(TLSSocketPrototype__writeCallback);
 
 STATIC_ASSERT_ISO_SUBSPACE_SHARABLE(JSTLSSocketPrototype, JSTLSSocketPrototype::Base);
 
 static const HashTableValue JSTLSSocketPrototypeTableValues[] = {
+    { "alpnProtocol"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__alpnProtocolGetterWrap, 0 } },
     { "authorized"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__authorizedGetterWrap, 0 } },
     { "data"_s, static_cast<unsigned>(JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__dataGetterWrap, TLSSocketPrototype__dataSetterWrap } },
     { "end"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__endCallback, 3 } },
@@ -17178,9 +17629,11 @@ static const HashTableValue JSTLSSocketPrototypeTableValues[] = {
     { "ref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__refCallback, 0 } },
     { "reload"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__reloadCallback, 1 } },
     { "remoteAddress"_s, static_cast<unsigned>(JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DOMAttribute | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::GetterSetterType, TLSSocketPrototype__remoteAddressGetterWrap, 0 } },
+    { "setServername"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__setServernameCallback, 1 } },
     { "shutdown"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__shutdownCallback, 1 } },
     { "timeout"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__timeoutCallback, 1 } },
     { "unref"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__unrefCallback, 0 } },
+    { "upgradeTLS"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__upgradeTLSCallback, 1 } },
     { "write"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function | PropertyAttribute::DontDelete), NoIntrinsic, { HashTableValue::NativeFunctionType, TLSSocketPrototype__writeCallback, 3 } }
 };
 
@@ -17198,6 +17651,18 @@ JSC_DEFINE_CUSTOM_GETTER(jsTLSSocketConstructor, (JSGlobalObject * lexicalGlobal
     return JSValue::encode(globalObject->JSTLSSocketConstructor());
 }
 
+JSC_DEFINE_CUSTOM_GETTER(TLSSocketPrototype__alpnProtocolGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
+    auto throwScope = DECLARE_THROW_SCOPE(vm);
+    JSTLSSocket* thisObject = jsCast<JSTLSSocket*>(JSValue::decode(thisValue)); // NOTE(review): no failure path here, unlike the jsDynamicCast + null check in the method callbacks
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+    JSC::EncodedJSValue result = TLSSocketPrototype__getALPNProtocol(thisObject->wrapped(), globalObject); // extern "C" getter behind the "alpnProtocol" property
+    RETURN_IF_EXCEPTION(throwScope, {}); // an exception pending after the native call wins over `result`
+    RELEASE_AND_RETURN(throwScope, result);
+}
+
 JSC_DEFINE_CUSTOM_GETTER(TLSSocketPrototype__authorizedGetterWrap, (JSGlobalObject * lexicalGlobalObject, EncodedJSValue thisValue, PropertyName attributeName))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -17454,6 +17919,33 @@ extern "C" EncodedJSValue TLSSocketPrototype__remoteAddressGetCachedValue(JSC::E
     return JSValue::encode(thisObject->m_remoteAddress.get());
 }
 
+JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__setServernameCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    // Host function behind the "setServername" prototype entry: check the receiver, then forward to the extern "C" TLSSocketPrototype__setServername.
+    JSTLSSocket* thisObject = jsDynamicCast<JSTLSSocket*>(callFrame->thisValue());
+    // A receiver that is not a JSTLSSocket produces a TypeError instead of reaching native code.
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data(); // NOTE(review): points into the temporary returned by utf8(); confirm it is only compared, never dereferenced later
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TLSSocketPrototype__setServername(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__shutdownCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
@@ -17535,6 +18027,33 @@ JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__unrefCallback, (JSGlobalObject * le
     return TLSSocketPrototype__unref(thisObject->wrapped(), lexicalGlobalObject, callFrame);
 }
 
+JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__upgradeTLSCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
+{
+    auto& vm = lexicalGlobalObject->vm();
+    // Host function behind the "upgradeTLS" prototype entry: check the receiver, then forward to the extern "C" TLSSocketPrototype__upgradeTLS.
+    JSTLSSocket* thisObject = jsDynamicCast<JSTLSSocket*>(callFrame->thisValue());
+    // A receiver that is not a JSTLSSocket produces a TypeError instead of reaching native code.
+    if (UNLIKELY(!thisObject)) {
+        auto throwScope = DECLARE_THROW_SCOPE(vm);
+        return throwVMTypeError(lexicalGlobalObject, throwScope);
+    }
+
+    JSC::EnsureStillAliveScope thisArg = JSC::EnsureStillAliveScope(thisObject);
+
+#ifdef BUN_DEBUG
+    /** View the file name of the JS file that called this function
+     * from a debugger */
+    SourceOrigin sourceOrigin = callFrame->callerSourceOrigin(vm);
+    const char* fileName = sourceOrigin.string().utf8().data(); // NOTE(review): points into the temporary returned by utf8(); confirm it is only compared, never dereferenced later
+    static const char* lastFileName = nullptr;
+    if (lastFileName != fileName) {
+        lastFileName = fileName;
+    }
+#endif
+
+    return TLSSocketPrototype__upgradeTLS(thisObject->wrapped(), lexicalGlobalObject, callFrame);
+}
+
 JSC_DEFINE_HOST_FUNCTION(TLSSocketPrototype__writeCallback, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
 {
     auto& vm = lexicalGlobalObject->vm();
diff --git a/src/bun.js/bindings/ZigGeneratedClasses.h b/src/bun.js/bindings/ZigGeneratedClasses.h
index 668cd3f6b..1631f960e 100644
--- a/src/bun.js/bindings/ZigGeneratedClasses.h
+++ b/src/bun.js/bindings/ZigGeneratedClasses.h
@@ -578,6 +578,89 @@ public:
     mutable JSC::WriteBarrier<JSC::Unknown> m_testValue;
 };
 
+class JSFSWatcher final : public JSC::JSDestructibleObject { // JS wrapper cell around an opaque native context pointer (m_ctx)
+public:
+    using Base = JSC::JSDestructibleObject;
+    static JSFSWatcher* create(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::Structure* structure, void* ctx);
+
+    DECLARE_EXPORT_INFO;
+    template<typename, JSC::SubspaceAccess mode> static JSC::GCClient::IsoSubspace* subspaceFor(JSC::VM& vm)
+    {
+        if constexpr (mode == JSC::SubspaceAccess::Concurrently)
+            return nullptr; // no subspace is handed out for concurrent access
+        return WebCore::subspaceForImpl<JSFSWatcher, WebCore::UseCustomHeapCellType::No>(
+            vm,
+            [](auto& spaces) { return spaces.m_clientSubspaceForFSWatcher.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_clientSubspaceForFSWatcher = std::forward<decltype(space)>(space); },
+            [](auto& spaces) { return spaces.m_subspaceForFSWatcher.get(); },
+            [](auto& spaces, auto&& space) { spaces.m_subspaceForFSWatcher = std::forward<decltype(space)>(space); });
+    }
+
+    static void destroy(JSC::JSCell*);
+    static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
+    {
+        return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(static_cast<JSC::JSType>(0b11101110), StructureFlags), info());
+    }
+
+    static JSObject* createPrototype(VM& vm, JSDOMGlobalObject* globalObject);
+    ;
+
+    ~JSFSWatcher();
+    // Native context stored by the constructor; nullptr once detach() has run.
+    void* wrapped() const { return m_ctx; }
+    // Clears the context pointer only; nothing is freed here.
+    void detach()
+    {
+        m_ctx = nullptr;
+    }
+
+    static void analyzeHeap(JSCell*, JSC::HeapAnalyzer&);
+    static ptrdiff_t offsetOfWrapped() { return OBJECT_OFFSETOF(JSFSWatcher, m_ctx); }
+
+    void* m_ctx { nullptr };
+    // Stores the context pointer and creates m_weakThis, a weak handle to this cell owned by getOwner().
+    JSFSWatcher(JSC::VM& vm, JSC::Structure* structure, void* sinkPtr)
+        : Base(vm, structure)
+    {
+        m_ctx = sinkPtr;
+        m_weakThis = JSC::Weak<JSFSWatcher>(this, getOwner());
+    }
+
+    void finishCreation(JSC::VM&);
+
+    JSC::Weak<JSFSWatcher> m_weakThis;
+    // Declared here, defined elsewhere; Owner calls it with wrapped() to decide reachability.
+    static bool hasPendingActivity(void* ctx);
+    // Weak-handle owner: the cell counts as reachable while hasPendingActivity() is true or `context` is an opaque root.
+    class Owner final : public JSC::WeakHandleOwner {
+    public:
+        bool isReachableFromOpaqueRoots(JSC::Handle<JSC::Unknown> handle, void* context, JSC::AbstractSlotVisitor& visitor, const char** reason) final
+        {
+            auto* controller = JSC::jsCast<JSFSWatcher*>(handle.slot()->asCell());
+            if (JSFSWatcher::hasPendingActivity(controller->wrapped())) {
+                if (UNLIKELY(reason))
+                    *reason = "has pending activity";
+                return true;
+            }
+
+            return visitor.containsOpaqueRoot(context);
+        }
+        void finalize(JSC::Handle<JSC::Unknown>, void* context) final {}
+    };
+
+    static JSC::WeakHandleOwner* getOwner()
+    {
+        static NeverDestroyed<Owner> m_owner;
+        return &m_owner.get();
+    }
+
+    DECLARE_VISIT_CHILDREN;
+    template<typename Visitor> void visitAdditionalChildren(Visitor&);
+    DECLARE_VISIT_OUTPUT_CONSTRAINTS;
+    // JS value slot that the visit functions append to the GC visitor.
+    mutable JSC::WriteBarrier<JSC::Unknown> m_listener;
+};
+
 class JSFileSystemRouter final : public JSC::JSDestructibleObject {
 public:
     using Base = JSC::JSDestructibleObject;
diff --git a/src/bun.js/bindings/ZigGlobalObject.cpp b/src/bun.js/bindings/ZigGlobalObject.cpp
index e49b94687..91d365af6 100644
--- a/src/bun.js/bindings/ZigGlobalObject.cpp
+++ b/src/bun.js/bindings/ZigGlobalObject.cpp
@@ -181,6 +181,8 @@ namespace JSCastingHelpers = JSC::JSCastingHelpers;
 #include "DOMWrapperWorld-class.h"
 #include "CommonJSModuleRecord.h"
 #include <wtf/RAMSize.h>
+#include <wtf/text/Base64.h>
+#include "simdutf.h"
 
 constexpr size_t DEFAULT_ERROR_STACK_TRACE_LIMIT = 10;
 
@@ -194,6 +196,24 @@ constexpr size_t DEFAULT_ERROR_STACK_TRACE_LIMIT = 10;
 // #include <iostream>
 static bool has_loaded_jsc = false;
 
+namespace WebCore {
+// Small helper exposing base64 decoding of a WTF String as an ExceptionOr<String>.
+class Base64Utilities {
+public:
+    // A null input returns a default-constructed String; input that base64Decode rejects returns InvalidCharacterError.
+    static ExceptionOr<String> atob(const String& encodedString)
+    {
+        if (encodedString.isNull())
+            return String();
+        auto bytes = base64Decode(encodedString, Base64DecodeMode::DefaultValidatePaddingAndIgnoreWhitespace);
+        if (bytes)
+            return String(bytes->data(), bytes->size());
+        return Exception { InvalidCharacterError };
+    }
+};
+
+}
+
 extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(const char* ptr, size_t length))
 {
     if (has_loaded_jsc)
@@ -219,7 +239,9 @@ extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(c
         JSC::Options::useJITCage() = false;
         JSC::Options::useShadowRealm() = true;
         JSC::Options::useResizableArrayBuffer() = true;
+#ifdef BUN_DEBUG
         JSC::Options::showPrivateScriptsInStackTraces() = true;
+#endif
         JSC::Options::useSetMethods() = true;
 
         /*
@@ -280,7 +302,13 @@ extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(c
         // crypto.createHash("sha1")    985.26 ns/iter    (956.7 ns … 1.12 µs)      1 µs   1.12 µs   1.12 µs
         // Peak memory usage: 56 MB
         size_t ramSize = WTF::ramSize();
-        ramSize /= 1024;
+
+        // We originally went with a hardcoded /= 1024 here
+        // But if you don't have much memory, that becomes a problem.
+        // Instead, we do 65%
+        double ramSizeDouble = static_cast<double>(ramSize);
+        ramSizeDouble *= 0.65;
+        ramSize = static_cast<size_t>(ramSizeDouble);
 
         if (ramSize > 0) {
             JSC::Options::forceRAMSize() = ramSize;
@@ -306,6 +334,140 @@ extern "C" void JSCInitialize(const char* envp[], size_t envc, void (*onCrash)(c
 }
 
 extern "C" void* Bun__getVM();
+extern "C" JSGlobalObject* Bun__getDefaultGlobal();
+
+// Error.captureStackTrace may cause computeErrorInfo to be called twice
+// Rather than figure out the plumbing in JSC, we just skip the next call
+// TODO: thread_local for workers
+static bool skipNextComputeErrorInfo = false;
+
+// Builds the string behind `error.stack`: "Name: message", then one "    at fn (url:line:column)"
+// row per frame. The first frame that has line/column info is also reported through the `line`,
+// `column` and `sourceURL` out-parameters. Returns an empty String when errorInstance is null or
+// skipNextComputeErrorInfo is set. Each frame is remapped through its own zero-initialized
+// ZigStackFrame, so no indeterminate field (e.g. `remapped`) crosses the FFI call or is read back.
+static String computeErrorInfoWithoutPrepareStackTrace(JSC::VM& vm, Vector<StackFrame>& stackTrace, unsigned& line, unsigned& column, String& sourceURL, JSObject* errorInstance)
+{
+    if (!errorInstance) {
+        return String();
+    }
+
+    if (skipNextComputeErrorInfo) {
+        return String();
+    }
+
+    Zig::GlobalObject* globalObject = jsDynamicCast<Zig::GlobalObject*>(errorInstance->globalObject());
+    if (!globalObject) {
+        // Happens in node:vm
+        globalObject = jsDynamicCast<Zig::GlobalObject*>(Bun__getDefaultGlobal());
+    }
+
+    WTF::String name = "Error"_s;
+    WTF::String message;
+
+    // Note that we are not allowed to allocate memory in here. It's called inside a finalizer.
+    if (auto* instance = jsDynamicCast<ErrorInstance*>(errorInstance)) {
+        name = instance->sanitizedNameString(globalObject);
+        message = instance->sanitizedMessageString(globalObject);
+    }
+
+    WTF::StringBuilder sb;
+
+    if (!name.isEmpty()) {
+        sb.append(name);
+        sb.append(": "_s);
+    }
+
+    if (!message.isEmpty()) {
+        sb.append(message);
+    }
+
+    if (stackTrace.isEmpty()) {
+        return sb.toString();
+    }
+
+    if ((!message.isEmpty() || !name.isEmpty())) {
+        sb.append("\n"_s);
+    }
+
+    size_t framesCount = stackTrace.size();
+    bool hasSet = false;
+    for (size_t i = 0; i < framesCount; i++) {
+        StackFrame& frame = stackTrace.at(i);
+
+        sb.append("    at "_s);
+
+        WTF::String functionName = frame.functionName(vm);
+
+        if (auto codeblock = frame.codeBlock()) {
+            if (codeblock->isConstructor()) {
+                sb.append("new "_s);
+            }
+
+            // TODO: async
+        }
+
+        if (functionName.isEmpty()) {
+            sb.append("<anonymous>"_s);
+        } else {
+            sb.append(functionName);
+        }
+
+        sb.append(" ("_s);
+
+        if (frame.hasLineAndColumnInfo()) {
+            unsigned int thisLine = 0;
+            unsigned int thisColumn = 0;
+            frame.computeLineAndColumn(thisLine, thisColumn);
+            ZigStackFrame remappedFrame = {};
+            remappedFrame.position.line = thisLine;
+            remappedFrame.position.column_start = thisColumn;
+            String sourceURLForFrame = frame.sourceURL(vm);
+
+            if (!sourceURLForFrame.isEmpty()) {
+                remappedFrame.source_url = Bun::toString(sourceURLForFrame);
+            } else {
+                // https://github.com/oven-sh/bun/issues/3595
+                remappedFrame.source_url = BunStringEmpty;
+            }
+
+            // This ensures the lifetime of the sourceURL is accounted for correctly
+            Bun__remapStackFramePositions(globalObject, &remappedFrame, 1);
+
+            if (!hasSet) {
+                hasSet = true;
+                line = thisLine; // out-params carry the position from computeLineAndColumn, not the remapped one
+                column = thisColumn;
+                sourceURL = sourceURLForFrame;
+
+                if (remappedFrame.remapped) {
+                    errorInstance->putDirect(vm, Identifier::fromString(vm, "originalLine"_s), jsNumber(thisLine), 0);
+                    errorInstance->putDirect(vm, Identifier::fromString(vm, "originalColumn"_s), jsNumber(thisColumn), 0);
+                }
+            }
+
+            sb.append(sourceURLForFrame);
+            sb.append(":"_s);
+            sb.append(remappedFrame.position.line);
+            sb.append(":"_s);
+            sb.append(remappedFrame.position.column_start);
+        } else {
+            sb.append("native"_s);
+        }
+        sb.append(")"_s);
+
+        if (i != framesCount - 1) {
+            sb.append("\n"_s);
+        }
+    }
+
+    return sb.toString();
+}
+// Callback installed via vm.setOnComputeErrorInfo; currently a plain forward to the helper above.
+static String computeErrorInfo(JSC::VM& vm, Vector<StackFrame>& stackTrace, unsigned& line, unsigned& column, String& sourceURL, JSObject* errorInstance)
+{
+    return computeErrorInfoWithoutPrepareStackTrace(vm, stackTrace, line, column, sourceURL, errorInstance);
+}
 
 extern "C" JSC__JSGlobalObject* Zig__GlobalObject__create(JSClassRef* globalObjectClass, int count,
     void* console_client)
@@ -323,6 +485,9 @@ extern "C" JSC__JSGlobalObject* Zig__GlobalObject__create(JSClassRef* globalObje
     Zig::GlobalObject* globalObject = Zig::GlobalObject::create(vm, Zig::GlobalObject::createStructure(vm, JSC::JSGlobalObject::create(vm, JSC::JSGlobalObject::createStructure(vm, JSC::jsNull())), JSC::jsNull()));
     globalObject->setConsole(globalObject);
     globalObject->isThreadLocalDefaultGlobalObject = true;
+    globalObject->setStackTraceLimit(DEFAULT_ERROR_STACK_TRACE_LIMIT); // Node.js defaults to 10
+    vm.setOnComputeErrorInfo(computeErrorInfo);
+
     if (count > 0) {
         globalObject->installAPIGlobals(globalObjectClass, count, vm);
     }
@@ -361,8 +526,8 @@ JSC_DEFINE_HOST_FUNCTION(functionFulfillModuleSync,
         &specifier,
         &specifier);
 
-    if (result.isUndefined() || !result) {
-        return JSValue::encode(result);
+    if (scope.exception() || !result) {
+        RELEASE_AND_RETURN(scope, JSValue::encode(JSC::jsUndefined()));
     }
 
     globalObject->moduleLoader()->provideFetch(globalObject, key, jsCast<JSC::JSSourceCode*>(result)->sourceCode());
@@ -869,6 +1034,21 @@ JSC_DEFINE_HOST_FUNCTION(functionBunSleepThenCallback,
     return JSC::JSValue::encode(promise);
 }
 
+using MicrotaskCallback = void (*)(void*);
+
+// Host function: calls a native `void (*)(void*)` callback whose address and argument are both passed in as numbers.
+JSC_DEFINE_HOST_FUNCTION(functionNativeMicrotaskTrampoline,
+    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+{
+    // Both pointers travel as raw number bits instead of JSCell*, because the GC would try to visit a cell.
+    const uintptr_t contextBits = bitwise_cast<uintptr_t>(callFrame->uncheckedArgument(0).asNumber());
+    const uintptr_t callbackBits = bitwise_cast<uintptr_t>(callFrame->uncheckedArgument(1).asNumber());
+
+    MicrotaskCallback invoke = reinterpret_cast<MicrotaskCallback>(callbackBits);
+    invoke(reinterpret_cast<void*>(contextBits));
+    return JSValue::encode(jsUndefined());
+}
+
 JSC_DEFINE_HOST_FUNCTION(functionBunSleep,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
@@ -1032,53 +1212,69 @@ JSC_DEFINE_HOST_FUNCTION(functionBTOA,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm());
 
     if (callFrame->argumentCount() == 0) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        JSC::throwTypeError(globalObject, scope, "btoa requires 1 argument (a string)"_s);
+        JSC::throwTypeError(globalObject, throwScope, "btoa requires 1 argument (a string)"_s);
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    const String& stringToEncode = callFrame->argument(0).toWTFString(globalObject);
+    JSValue arg0 = callFrame->uncheckedArgument(0);
+    WTF::String encodedString = arg0.toWTFString(globalObject);
+    RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {}));
 
-    if (!stringToEncode || stringToEncode.isNull()) {
-        return JSC::JSValue::encode(JSC::jsString(vm, WTF::String()));
+    if (encodedString.isEmpty()) {
+        return JSC::JSValue::encode(JSC::jsEmptyString(vm));
     }
 
-    if (!stringToEncode.isAllLatin1()) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        throwException(globalObject, scope, createDOMException(globalObject, ExceptionCode::InvalidCharacterError));
+    if (!encodedString.isAllLatin1()) {
+        throwException(globalObject, throwScope, createDOMException(globalObject, InvalidCharacterError));
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    return JSC::JSValue::encode(JSC::jsString(vm, WTF::base64EncodeToString(stringToEncode.latin1())));
+    // Reminder: btoa() is for Byte Strings
+    // Specifically: latin1 byte strings
+    // That means even though this looks like the wrong thing to do,
+    // we should be converting to latin1, not utf8.
+    if (!encodedString.is8Bit()) {
+        LChar* ptr;
+        unsigned length = encodedString.length();
+        auto dest = WTF::String::createUninitialized(length, ptr);
+        WTF::StringImpl::copyCharacters(ptr, encodedString.characters16(), length);
+        encodedString = WTFMove(dest);
+    }
+
+    unsigned length = encodedString.length();
+    RELEASE_AND_RETURN(
+        throwScope,
+        Bun__encoding__toString(
+            encodedString.characters8(),
+            length,
+            globalObject,
+            static_cast<uint8_t>(WebCore::BufferEncodingType::base64)));
 }
 
 static JSC_DEFINE_HOST_FUNCTION(functionATOB,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
+    auto throwScope = DECLARE_THROW_SCOPE(globalObject->vm()); // single scope shared by every throw/return path below
 
     if (callFrame->argumentCount() == 0) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        JSC::throwTypeError(globalObject, scope, "atob requires 1 argument (a string)"_s);
+        JSC::throwTypeError(globalObject, throwScope, "atob requires 1 argument (a string)"_s);
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    const WTF::String& encodedString = callFrame->argument(0).toWTFString(globalObject);
-
-    if (encodedString.isNull()) {
-        return JSC::JSValue::encode(JSC::jsEmptyString(vm));
-    }
+    WTF::String encodedString = callFrame->uncheckedArgument(0).toWTFString(globalObject); // argument 0 exists: argumentCount() == 0 already returned above
+    RETURN_IF_EXCEPTION(throwScope, JSC::JSValue::encode(JSC::JSValue {})); // bail out if converting the argument to a string threw
 
-    auto decodedData = WTF::base64Decode(encodedString, Base64DecodeMode::DefaultValidatePaddingAndIgnoreWhitespace);
-    if (!decodedData) {
-        auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
-        throwException(globalObject, scope, createDOMException(globalObject, ExceptionCode::InvalidCharacterError));
+    auto result = WebCore::Base64Utilities::atob(encodedString); // holds either the decoded string or an exception; checked right below
+    if (result.hasException()) {
+        throwException(globalObject, throwScope, createDOMException(*globalObject, result.releaseException())); // rethrow the decoder's exception to JS as a DOMException
         return JSC::JSValue::encode(JSC::JSValue {});
     }
 
-    return JSC::JSValue::encode(JSC::jsString(vm, WTF::String(decodedData->data(), decodedData->size())));
+    RELEASE_AND_RETURN(throwScope, JSValue::encode(jsString(vm, result.releaseReturnValue()))); // success: wrap the decoded string in a JS string
 }
 
 static JSC_DEFINE_HOST_FUNCTION(functionHashCode,
@@ -1271,10 +1467,12 @@ JSC_DEFINE_HOST_FUNCTION(functionCallNotImplemented,
 
 // we're trying out a new way to do this lazy loading
 static JSC_DEFINE_HOST_FUNCTION(functionLazyLoad,
-    (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
+    (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callFrame))
 {
 JSC:
+    Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject);
     VM& vm = globalObject->vm();
+
     switch (callFrame->argumentCount()) {
     case 0: {
         auto scope = DECLARE_THROW_SCOPE(globalObject->vm());
@@ -1283,13 +1481,6 @@ JSC:
         return JSC::JSValue::encode(JSC::JSValue {});
     }
     default: {
-        static NeverDestroyed<const String> sqliteString(MAKE_STATIC_STRING_IMPL("sqlite"));
-        static NeverDestroyed<const String> bunJSCString(MAKE_STATIC_STRING_IMPL("bun:jsc"));
-        static NeverDestroyed<const String> bunStreamString(MAKE_STATIC_STRING_IMPL("bun:stream"));
-        static NeverDestroyed<const String> noopString(MAKE_STATIC_STRING_IMPL("noop"));
-        static NeverDestroyed<const String> createImportMeta(MAKE_STATIC_STRING_IMPL("createImportMeta"));
-        static NeverDestroyed<const String> masqueradesAsUndefined(MAKE_STATIC_STRING_IMPL("masqueradesAsUndefined"));
-        static NeverDestroyed<const String> vmString(MAKE_STATIC_STRING_IMPL("vm"));
 
         JSC::JSValue moduleName = callFrame->argument(0);
         if (moduleName.isNumber()) {
@@ -1328,24 +1519,24 @@ JSC:
             return JSC::JSValue::encode(JSC::JSValue {});
         }
 
-        if (string == sqliteString) {
+        if (string == "sqlite"_s) {
             return JSC::JSValue::encode(JSSQLStatementConstructor::create(vm, globalObject, JSSQLStatementConstructor::createStructure(vm, globalObject, globalObject->m_functionPrototype.get())));
         }
 
-        if (string == bunJSCString) {
+        if (string == "bun:jsc"_s) {
             return JSC::JSValue::encode(createJSCModule(globalObject));
         }
 
-        if (string == pathToFileURLString) {
+        if (string == "pathToFileURL"_s) {
             return JSValue::encode(
                 JSFunction::create(vm, globalObject, 1, pathToFileURLString, functionPathToFileURL, ImplementationVisibility::Public, NoIntrinsic));
         }
-        if (string == fileURLToPathString) {
+        if (string == "fileURLToPath"_s) {
             return JSValue::encode(
                 JSFunction::create(vm, globalObject, 1, fileURLToPathString, functionFileURLToPath, ImplementationVisibility::Public, NoIntrinsic));
         }
 
-        if (string == bunStreamString) {
+        if (string == "bun:stream"_s) {
             auto* obj = constructEmptyObject(globalObject);
             obj->putDirect(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "BufferList"_s)), reinterpret_cast<Zig::GlobalObject*>(globalObject)->JSBufferList(), 0);
             obj->putDirect(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "ReadableState"_s)), reinterpret_cast<Zig::GlobalObject*>(globalObject)->JSReadableState(), 0);
@@ -1364,16 +1555,16 @@ JSC:
             return JSValue::encode(obj);
         }
 
-        if (string == createImportMeta) {
+        if (string == "createImportMeta"_s) {
             Zig::ImportMetaObject* obj = Zig::ImportMetaObject::create(globalObject, callFrame->argument(1));
             return JSValue::encode(obj);
         }
 
-        if (string == masqueradesAsUndefined) {
+        if (string == "masqueradesAsUndefined"_s) {
             return JSValue::encode(InternalFunction::createFunctionThatMasqueradesAsUndefined(vm, globalObject, 0, String(), functionCallNotImplemented));
         }
 
-        if (string == vmString) {
+        if (string == "vm"_s) {
             auto* obj = constructEmptyObject(globalObject);
             obj->putDirect(
                 vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "Script"_s)),
@@ -1394,7 +1585,22 @@ JSC:
             return JSValue::encode(obj);
         }
 
-        if (UNLIKELY(string == noopString)) {
+        if (string == "vm"_s) {
+            auto* obj = constructEmptyObject(globalObject);
+        }
+
+        if (string == "primordials"_s) {
+            auto sourceOrigin = callFrame->callerSourceOrigin(vm).url();
+            bool isBuiltin = sourceOrigin.protocolIs("builtin"_s);
+            if (!isBuiltin) {
+                return JSC::JSValue::encode(JSC::jsUndefined());
+            }
+
+            auto* obj = globalObject->primordialsObject();
+            return JSValue::encode(obj);
+        }
+
+        if (UNLIKELY(string == "noop"_s)) {
             auto* obj = constructEmptyObject(globalObject);
             obj->putDirectCustomAccessor(vm, JSC::PropertyName(JSC::Identifier::fromString(vm, "getterSetter"_s)), JSC::CustomGetterSetter::create(vm, noop_getter, noop_setter), 0);
             Zig::JSFFIFunction* function = Zig::JSFFIFunction::create(vm, reinterpret_cast<Zig::GlobalObject*>(globalObject), 0, String(), functionNoop, JSC::NoIntrinsic);
@@ -2569,7 +2775,32 @@ JSC::JSValue GlobalObject::formatStackTrace(JSC::VM& vm, JSC::JSGlobalObject* le
 
 extern "C" EncodedJSValue JSPasswordObject__create(JSC::JSGlobalObject*, bool);
 
-JSC_DECLARE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace);
+// Error.appendStackTrace(source, destination): moves the frames captured on `source` onto the end of `destination`'s trace.
+JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncAppendStackTrace, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callFrame))
+{
+    GlobalObject* globalObject = reinterpret_cast<GlobalObject*>(lexicalGlobalObject);
+    JSC::VM& vm = globalObject->vm();
+    auto scope = DECLARE_THROW_SCOPE(vm);
+
+    JSC::ErrorInstance* source = jsDynamicCast<JSC::ErrorInstance*>(callFrame->argument(0));
+    JSC::ErrorInstance* destination = jsDynamicCast<JSC::ErrorInstance*>(callFrame->argument(1));
+    if (!source || !destination) {
+        throwTypeError(lexicalGlobalObject, scope, "First & second argument must be an Error object"_s);
+        return JSC::JSValue::encode(jsUndefined());
+    }
+
+    if (!destination->stackTrace()) {
+        destination->captureStackTrace(vm, globalObject, 1); // give destination a trace of its own to append onto
+    }
+
+    // captureStackTrace() is not guaranteed to have populated the trace, so re-check before dereferencing it.
+    if (source->stackTrace() && destination->stackTrace()) {
+        destination->stackTrace()->appendVector(*source->stackTrace());
+        source->stackTrace()->clear(); // the frames now live on destination only
+    }
+    return JSC::JSValue::encode(jsUndefined());
+}
+
 JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalObject * lexicalGlobalObject, JSC::CallFrame* callFrame))
 {
     GlobalObject* globalObject = reinterpret_cast<GlobalObject*>(lexicalGlobalObject);
@@ -2584,18 +2815,15 @@ JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalOb
     JSC::JSObject* errorObject = objectArg.asCell()->getObject();
     JSC::JSValue caller = callFrame->argument(1);
 
+    // We cannot use our ErrorInstance::captureStackTrace() fast path here unfortunately.
+    // We need to return these CallSite array objects which means we need to create them
     JSValue errorValue = lexicalGlobalObject->get(lexicalGlobalObject, vm.propertyNames->Error);
     auto* errorConstructor = jsDynamicCast<JSC::JSObject*>(errorValue);
-
-    size_t stackTraceLimit = DEFAULT_ERROR_STACK_TRACE_LIMIT;
-    if (JSC::JSValue stackTraceLimitProp = errorConstructor->getIfPropertyExists(lexicalGlobalObject, vm.propertyNames->stackTraceLimit)) {
-        if (stackTraceLimitProp.isNumber()) {
-            stackTraceLimit = std::min(std::max(static_cast<size_t>(stackTraceLimitProp.toIntegerOrInfinity(lexicalGlobalObject)), 0ul), 2048ul);
-            if (stackTraceLimit == 0) {
-                stackTraceLimit = 2048;
-            }
-        }
+    size_t stackTraceLimit = globalObject->stackTraceLimit().value();
+    if (stackTraceLimit == 0) {
+        stackTraceLimit = DEFAULT_ERROR_STACK_TRACE_LIMIT;
     }
+
     JSCStackTrace stackTrace = JSCStackTrace::captureCurrentJSStackTrace(globalObject, callFrame, stackTraceLimit, caller);
 
     // Create an (uninitialized) array for our "call sites"
@@ -2619,7 +2847,7 @@ JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalOb
     size_t framesCount = stackTrace.size();
     ZigStackFrame remappedFrames[framesCount];
     for (int i = 0; i < framesCount; i++) {
-        remappedFrames[i].source_url = Zig::toZigString(stackTrace.at(i).sourceURL(), lexicalGlobalObject);
+        remappedFrames[i].source_url = Bun::toString(lexicalGlobalObject, stackTrace.at(i).sourceURL());
         if (JSCStackFrame::SourcePositions* sourcePositions = stackTrace.at(i).getSourcePositions()) {
             remappedFrames[i].position.line = sourcePositions->line.zeroBasedInt();
             remappedFrames[i].position.column_start = sourcePositions->startColumn.zeroBasedInt() + 1;
@@ -2652,13 +2880,27 @@ JSC_DEFINE_HOST_FUNCTION(errorConstructorFuncCaptureStackTrace, (JSC::JSGlobalOb
     JSC::JSValue formattedStackTrace = globalObject->formatStackTrace(vm, lexicalGlobalObject, errorObject, callSites);
     RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode({}));
 
+    bool orignialSkipNextComputeErrorInfo = skipNextComputeErrorInfo;
+    skipNextComputeErrorInfo = true;
     if (errorObject->hasProperty(lexicalGlobalObject, vm.propertyNames->stack)) {
+        skipNextComputeErrorInfo = true;
         errorObject->deleteProperty(lexicalGlobalObject, vm.propertyNames->stack);
     }
+    skipNextComputeErrorInfo = orignialSkipNextComputeErrorInfo;
+
     if (formattedStackTrace.isUndefinedOrNull()) {
-        errorObject->putDirect(vm, vm.propertyNames->stack, jsUndefined(), JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::DontEnum);
-    } else {
-        errorObject->putDirect(vm, vm.propertyNames->stack, formattedStackTrace, JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::DontEnum);
+        formattedStackTrace = JSC::jsUndefined();
+    }
+
+    errorObject->putDirect(vm, vm.propertyNames->stack, formattedStackTrace, 0);
+
+    if (auto* instance = jsDynamicCast<JSC::ErrorInstance*>(errorObject)) {
+        // we make a separate copy of the StackTrace unfortunately so that we
+        // can later console.log it without losing the info
+        //
+        // This is not ideal: the extra capture strictly makes this function slower than
+        // necessary, so we should remove it in the future.
+        instance->captureStackTrace(vm, globalObject, 1, false);
     }
 
     RETURN_IF_EXCEPTION(scope, JSC::JSValue::encode(JSValue {}));
@@ -2721,7 +2963,7 @@ void GlobalObject::finishCreation(VM& vm)
             JSC::Structure* structure = globalObject->structureCache().emptyObjectStructureForPrototype(
                 globalObject,
                 globalObject->objectPrototype(),
-                5);
+                3);
             JSC::PropertyOffset offset;
             auto& vm = globalObject->vm();
 
@@ -2735,13 +2977,6 @@ void GlobalObject::finishCreation(VM& vm)
             structure = structure->addPropertyTransition(
                 vm,
                 structure,
-                JSC::Identifier::fromString(vm, "exports"_s),
-                0,
-                offset);
-
-            structure = structure->addPropertyTransition(
-                vm,
-                structure,
                 JSC::Identifier::fromString(vm, "__dirname"_s),
                 0,
                 offset);
@@ -2753,13 +2988,6 @@ void GlobalObject::finishCreation(VM& vm)
                 0,
                 offset);
 
-            structure = structure->addPropertyTransition(
-                vm,
-                structure,
-                JSC::Identifier::fromString(vm, "require"_s),
-                JSC::PropertyAttribute::Function | JSC::PropertyAttribute::Builtin | 0,
-                offset);
-
             init.set(structure);
         });
 
@@ -2832,6 +3060,11 @@ void GlobalObject::finishCreation(VM& vm)
             init.set(JSFunction::create(init.vm, init.owner, 4, "performMicrotaskVariadic"_s, jsFunctionPerformMicrotaskVariadic, ImplementationVisibility::Public));
         });
 
+    m_nativeMicrotaskTrampoline.initLater(
+        [](const Initializer<JSFunction>& init) {
+            init.set(JSFunction::create(init.vm, init.owner, 2, ""_s, functionNativeMicrotaskTrampoline, ImplementationVisibility::Public));
+        });
+
     m_navigatorObject.initLater(
         [](const Initializer<JSObject>& init) {
             int cpuCount = 0;
@@ -2945,11 +3178,7 @@ void GlobalObject::finishCreation(VM& vm)
             Zig::GlobalObject* globalObject = reinterpret_cast<Zig::GlobalObject*>(init.owner);
             auto* process = Zig::Process::create(
                 *globalObject, Zig::Process::createStructure(init.vm, init.owner, WebCore::JSEventEmitter::prototype(init.vm, *globalObject)));
-            process->putDirectCustomAccessor(init.vm, JSC::Identifier::fromString(init.vm, "env"_s),
-                JSC::CustomGetterSetter::create(init.vm, lazyProcessEnvGetter, lazyProcessEnvSetter),
-                JSC::PropertyAttribute::DontDelete
-                    | JSC::PropertyAttribute::CustomValue
-                    | 0);
+
             init.set(process);
         });
 
@@ -2979,14 +3208,20 @@ void GlobalObject::finishCreation(VM& vm)
             init.set(structure);
         });
 
-    m_requireResolveFunctionStructure.initLater(
-        [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::Structure>::Initializer& init) {
-            init.set(Zig::ImportMetaObject::createResolveFunctionStructure(init.vm, jsCast<Zig::GlobalObject*>(init.owner)));
+    m_importMetaRequireFunctionUnbound.initLater(
+        [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::JSObject>::Initializer& init) {
+            init.set(
+                Zig::ImportMetaObject::createRequireFunctionUnbound(init.vm, init.owner));
         });
-
-    m_resolveFunctionPrototype.initLater(
+    m_importMetaRequireResolveFunctionUnbound.initLater(
         [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::JSObject>::Initializer& init) {
-            init.set(Zig::ImportMetaObject::createResolveFunctionPrototype(init.vm, jsCast<Zig::GlobalObject*>(init.owner)).getObject());
+            init.set(
+                Zig::ImportMetaObject::createRequireResolveFunctionUnbound(init.vm, init.owner));
+        });
+
+    m_importMetaObjectStructure.initLater(
+        [](const JSC::LazyProperty<JSC::JSGlobalObject, JSC::Structure>::Initializer& init) {
+            init.set(Zig::ImportMetaObject::createStructure(init.vm, init.owner));
         });
 
     m_JSFileSinkClassStructure.initLater(
@@ -3110,11 +3345,8 @@ void GlobalObject::finishCreation(VM& vm)
     RELEASE_ASSERT(classInfo());
 
     JSC::JSObject* errorConstructor = this->errorConstructor();
-    errorConstructor->putDirectNativeFunctionWithoutTransition(vm, this, JSC::Identifier::fromString(vm, "captureStackTrace"_s), 2, errorConstructorFuncCaptureStackTrace, ImplementationVisibility::Public, JSC::NoIntrinsic, PropertyAttribute::DontEnum | 0);
-
-    // JSC default is 100
-    errorConstructor->putDirect(vm, vm.propertyNames->stackTraceLimit, jsNumber(DEFAULT_ERROR_STACK_TRACE_LIMIT), JSC::PropertyAttribute::DontEnum | 0);
-
+    errorConstructor->putDirectNativeFunction(vm, this, JSC::Identifier::fromString(vm, "captureStackTrace"_s), 2, errorConstructorFuncCaptureStackTrace, ImplementationVisibility::Public, JSC::NoIntrinsic, PropertyAttribute::DontEnum | 0);
+    errorConstructor->putDirectNativeFunction(vm, this, JSC::Identifier::fromString(vm, "appendStackTrace"_s), 2, errorConstructorFuncAppendStackTrace, ImplementationVisibility::Private, JSC::NoIntrinsic, PropertyAttribute::DontEnum | 0);
     JSC::JSValue console = this->get(this, JSC::Identifier::fromString(vm, "console"_s));
     JSC::JSObject* consoleObject = console.getObject();
     consoleObject->putDirectBuiltinFunction(vm, this, vm.propertyNames->asyncIteratorSymbol, consoleObjectAsyncIteratorCodeGenerator(vm), PropertyAttribute::Builtin | PropertyAttribute::DontDelete);
@@ -3434,7 +3666,7 @@ void GlobalObject::addBuiltinGlobals(JSC::VM& vm)
     auto& builtinNames = WebCore::builtinNames(vm);
 
     WTF::Vector<GlobalPropertyInfo> extraStaticGlobals;
-    extraStaticGlobals.reserveCapacity(43);
+    extraStaticGlobals.reserveCapacity(44);
 
     JSC::Identifier queueMicrotaskIdentifier = JSC::Identifier::fromString(vm, "queueMicrotask"_s);
     extraStaticGlobals.uncheckedAppend(
@@ -3563,7 +3795,7 @@ void GlobalObject::addBuiltinGlobals(JSC::VM& vm)
     putDirectBuiltinFunction(vm, this, builtinNames.loadCJS2ESMPrivateName(), importMetaObjectLoadCJS2ESMCodeGenerator(vm), PropertyAttribute::Builtin | PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly);
     putDirectBuiltinFunction(vm, this, builtinNames.internalRequirePrivateName(), importMetaObjectInternalRequireCodeGenerator(vm), PropertyAttribute::Builtin | PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly);
     putDirectNativeFunction(vm, this, builtinNames.createUninitializedArrayBufferPrivateName(), 1, functionCreateUninitializedArrayBuffer, ImplementationVisibility::Public, NoIntrinsic, PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly | PropertyAttribute::Function);
-    putDirectNativeFunction(vm, this, builtinNames.resolveSyncPrivateName(), 1, functionImportMeta__resolveSync, ImplementationVisibility::Public, NoIntrinsic, PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly | PropertyAttribute::Function);
+    putDirectNativeFunction(vm, this, builtinNames.resolveSyncPrivateName(), 1, functionImportMeta__resolveSyncPrivate, ImplementationVisibility::Public, NoIntrinsic, PropertyAttribute::DontDelete | PropertyAttribute::ReadOnly | PropertyAttribute::Function);
 
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "process"_s), JSC::CustomGetterSetter::create(vm, property_lazyProcessGetter, property_lazyProcessSetter),
         JSC::PropertyAttribute::CustomAccessor | 0);
@@ -3655,30 +3887,44 @@ void GlobalObject::addBuiltinGlobals(JSC::VM& vm)
     PUT_WEBCORE_GENERATED_CONSTRUCTOR("Headers"_s, JSFetchHeaders);
     PUT_WEBCORE_GENERATED_CONSTRUCTOR("URLSearchParams"_s, JSURLSearchParams);
 
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().TransformStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableByteStreamControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBRequestPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultWriterPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().AbortSignalPrivateName(), CustomGetterSetter::create(vm, JSDOMAbortSignal_getter, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableByteStreamControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamBYOBRequestPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().ReadableStreamDefaultReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-    putDirectCustomAccessor(vm, static_cast<JSVMClientData*>(vm.clientData)->builtinNames().WritableStreamDefaultWriterPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
-
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.TransformStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_TransformStreamDefaultControllerConstructor, nullptr), attributesForStructure(static_cast<unsigned>(JSC::PropertyAttribute::DontEnum)));
+    putDirectCustomAccessor(vm, builtinNames.ReadableByteStreamControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBRequestPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultReaderPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultControllerPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultWriterPrivateName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), attributesForStructure(JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly));
+    putDirectCustomAccessor(vm, builtinNames.AbortSignalPrivateName(), CustomGetterSetter::create(vm, JSDOMAbortSignal_getter, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableByteStreamControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableByteStreamControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamBYOBRequestPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamBYOBRequestConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.ReadableStreamDefaultReaderPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ReadableStreamDefaultReaderConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultControllerPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultControllerConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+    putDirectCustomAccessor(vm, builtinNames.WritableStreamDefaultWriterPublicName(), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_WritableStreamDefaultWriterConstructor, nullptr), JSC::PropertyAttribute::CustomAccessor | JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
+
+    putDirectNativeFunction(vm, this,
+        builtinNames.createCommonJSModulePrivateName(),
+        2,
+        Bun::jsFunctionCreateCommonJSModule,
+        ImplementationVisibility::Public,
+        NoIntrinsic,
+        JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DontDelete | 0);
+    putDirectNativeFunction(vm, this,
+        builtinNames.evaluateCommonJSModulePrivateName(),
+        2,
+        Bun::jsFunctionLoadModule,
+        ImplementationVisibility::Public,
+        NoIntrinsic,
+        JSC::PropertyAttribute::ReadOnly | JSC::PropertyAttribute::Function | JSC::PropertyAttribute::DontDelete | 0);
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "ByteLengthQueuingStrategy"_s), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_ByteLengthQueuingStrategyConstructor, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "CountQueuingStrategy"_s), CustomGetterSetter::create(vm, jsServiceWorkerGlobalScope_CountQueuingStrategyConstructor, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
     putDirectCustomAccessor(vm, JSC::Identifier::fromString(vm, "SubtleCrypto"_s), JSC::CustomGetterSetter::create(vm, getterSubtleCryptoConstructor, nullptr), JSC::PropertyAttribute::DontDelete | JSC::PropertyAttribute::ReadOnly);
@@ -4013,6 +4259,7 @@ void GlobalObject::visitChildrenImpl(JSCell* cell, Visitor& visitor)
     thisObject->m_JSFileSinkControllerPrototype.visit(visitor);
     thisObject->m_JSHTTPSResponseControllerPrototype.visit(visitor);
     thisObject->m_navigatorObject.visit(visitor);
+    thisObject->m_nativeMicrotaskTrampoline.visit(visitor);
     thisObject->m_performanceObject.visit(visitor);
     thisObject->m_primordialsObject.visit(visitor);
     thisObject->m_processEnvObject.visit(visitor);
@@ -4023,8 +4270,10 @@ void GlobalObject::visitChildrenImpl(JSCell* cell, Visitor& visitor)
     thisObject->m_emitReadableNextTickFunction.visit(visitor);
     thisObject->m_JSBufferSubclassStructure.visit(visitor);
 
-    thisObject->m_requireResolveFunctionStructure.visit(visitor);
-    thisObject->m_resolveFunctionPrototype.visit(visitor);
+    thisObject->m_importMetaRequireFunctionUnbound.visit(visitor);
+    thisObject->m_importMetaRequireResolveFunctionUnbound.visit(visitor);
+    thisObject->m_importMetaObjectStructure.visit(visitor);
+
     thisObject->m_dnsObject.visit(visitor);
     thisObject->m_lazyRequireCacheObject.visit(visitor);
     thisObject->m_vmModuleContextMap.visit(visitor);
@@ -4173,6 +4422,14 @@ extern "C" void JSC__JSGlobalObject__reload(JSC__JSGlobalObject* arg0)
     globalObject->reload();
 }
 
+extern "C" void JSC__JSGlobalObject__queueMicrotaskCallback(Zig::GlobalObject* globalObject, void* ptr, MicrotaskCallback callback)
+{
+    // Hand both raw addresses to the trampoline as double-encoded JSValues rather than JSCell*, so the GC never tries to visit them.
+    const JSValue encodedContext(bitwise_cast<double>(reinterpret_cast<uintptr_t>(ptr)));
+    const JSValue encodedCallback(bitwise_cast<double>(reinterpret_cast<uintptr_t>(callback)));
+    globalObject->queueMicrotask(globalObject->nativeMicrotaskTrampoline(), encodedContext, encodedCallback, jsUndefined(), jsUndefined());
+}
+
 JSC::Identifier GlobalObject::moduleLoaderResolve(JSGlobalObject* globalObject,
     JSModuleLoader* loader, JSValue key,
     JSValue referrer, JSValue origin)
diff --git a/src/bun.js/bindings/ZigGlobalObject.h b/src/bun.js/bindings/ZigGlobalObject.h
index 2d69e764f..f44212da1 100644
--- a/src/bun.js/bindings/ZigGlobalObject.h
+++ b/src/bun.js/bindings/ZigGlobalObject.h
@@ -248,8 +248,8 @@ public:
 
     JSC::JSFunction* emitReadableNextTickFunction() { return m_emitReadableNextTickFunction.getInitializedOnMainThread(this); }
 
-    Structure* requireResolveFunctionStructure() { return m_requireResolveFunctionStructure.getInitializedOnMainThread(this); }
-    JSObject* requireResolveFunctionPrototype() { return m_resolveFunctionPrototype.getInitializedOnMainThread(this); }
+    JSObject* importMetaRequireFunctionUnbound() { return m_importMetaRequireFunctionUnbound.getInitializedOnMainThread(this); } // Accessor for the LazyProperty m_importMetaRequireFunctionUnbound.
+    JSObject* importMetaRequireResolveFunctionUnbound() { return m_importMetaRequireResolveFunctionUnbound.getInitializedOnMainThread(this); } // Accessor for the LazyProperty m_importMetaRequireResolveFunctionUnbound.
 
     JSObject* lazyRequireCacheObject() { return m_lazyRequireCacheObject.getInitializedOnMainThread(this); }
 
@@ -262,6 +262,7 @@ public:
     JSObject* lazyTestModuleObject() { return m_lazyTestModuleObject.getInitializedOnMainThread(this); }
     JSObject* lazyPreloadTestModuleObject() { return m_lazyPreloadTestModuleObject.getInitializedOnMainThread(this); }
     Structure* CommonJSModuleObjectStructure() { return m_commonJSModuleObjectStructure.getInitializedOnMainThread(this); }
+    Structure* ImportMetaObjectStructure() { return m_importMetaObjectStructure.getInitializedOnMainThread(this); } // Structure accessor backed by the LazyProperty m_importMetaObjectStructure.
 
     Structure* commonJSFunctionArgumentsStructure() { return m_commonJSFunctionArgumentsStructure.getInitializedOnMainThread(this); }
 
@@ -269,6 +270,8 @@ public:
 
     JSWeakMap* vmModuleContextMap() { return m_vmModuleContextMap.getInitializedOnMainThread(this); }
 
+    bool hasProcessObject() const { return m_processObject.isInitialized(); } // Reports whether m_processObject is already initialized; unlike processObject() it does not call getInitializedOnMainThread.
+
     JSC::JSObject* processObject()
     {
         return m_processObject.getInitializedOnMainThread(this);
@@ -368,6 +371,7 @@ public:
     mutable WriteBarrier<JSFunction> m_thenables[promiseFunctionsSize + 1];
 
     JSObject* navigatorObject();
+    JSFunction* nativeMicrotaskTrampoline() { return m_nativeMicrotaskTrampoline.getInitializedOnMainThread(this); } // JSFunction that JSC__JSGlobalObject__queueMicrotaskCallback queues as the microtask job.
 
     void trackFFIFunction(JSC::JSFunction* function)
     {
@@ -465,6 +469,7 @@ private:
      */
     LazyProperty<JSGlobalObject, JSC::Structure> m_pendingVirtualModuleResultStructure;
     LazyProperty<JSGlobalObject, JSFunction> m_performMicrotaskFunction;
+    LazyProperty<JSGlobalObject, JSFunction> m_nativeMicrotaskTrampoline;
     LazyProperty<JSGlobalObject, JSFunction> m_performMicrotaskVariadicFunction;
     LazyProperty<JSGlobalObject, JSFunction> m_emitReadableNextTickFunction;
     LazyProperty<JSGlobalObject, JSMap> m_lazyReadableStreamPrototypeMap;
@@ -481,8 +486,6 @@ private:
     LazyProperty<JSGlobalObject, JSObject> m_subtleCryptoObject;
     LazyProperty<JSGlobalObject, Structure> m_JSHTTPResponseController;
     LazyProperty<JSGlobalObject, JSC::Structure> m_JSBufferSubclassStructure;
-    LazyProperty<JSGlobalObject, JSC::Structure> m_requireResolveFunctionStructure;
-    LazyProperty<JSGlobalObject, JSObject> m_resolveFunctionPrototype;
     LazyProperty<JSGlobalObject, JSObject> m_dnsObject;
     LazyProperty<JSGlobalObject, JSWeakMap> m_vmModuleContextMap;
     LazyProperty<JSGlobalObject, JSObject> m_lazyRequireCacheObject;
@@ -496,6 +499,10 @@ private:
     LazyProperty<JSGlobalObject, Structure> m_commonJSModuleObjectStructure;
     LazyProperty<JSGlobalObject, Structure> m_commonJSFunctionArgumentsStructure;
 
+    LazyProperty<JSGlobalObject, JSC::JSObject> m_importMetaRequireFunctionUnbound;
+    LazyProperty<JSGlobalObject, JSC::JSObject> m_importMetaRequireResolveFunctionUnbound;
+    LazyProperty<JSGlobalObject, JSC::Structure> m_importMetaObjectStructure;
+
     DOMGuardedObjectSet m_guardedObjects WTF_GUARDED_BY_LOCK(m_gcLock);
     void* m_bunVM;
 
diff --git a/src/bun.js/bindings/ZigSourceProvider.cpp b/src/bun.js/bindings/ZigSourceProvider.cpp
index ab3062cd5..a71e946de 100644
--- a/src/bun.js/bindings/ZigSourceProvider.cpp
+++ b/src/bun.js/bindings/ZigSourceProvider.cpp
@@ -43,39 +43,34 @@ static uintptr_t getSourceProviderMapKey(ResolvedSource& resolvedSource)
     }
 }
 
-Ref<SourceProvider> SourceProvider::create(Zig::GlobalObject* globalObject, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType)
+static SourceOrigin toSourceOrigin(const String& sourceURL, bool isBuiltin) // Maps a source URL to its SourceOrigin: a builtin:// URL when isBuiltin, a file URL otherwise.
 {
-
-    uintptr_t providerKey = 0;
-    if (globalObject->isThreadLocalDefaultGlobalObject) {
-        auto& sourceProviderMap = globalObject->sourceProviderMap;
-        providerKey = getSourceProviderMapKey(resolvedSource);
-        if (providerKey) {
-            auto sourceProvider = sourceProviderMap.get(providerKey);
-            if (sourceProvider != nullptr) {
-                sourceProvider->ref();
-                return adoptRef(*reinterpret_cast<Zig::SourceProvider*>(sourceProvider));
-            }
+    if (isBuiltin) {
+        if (sourceURL.startsWith("node:"_s)) {
+            return SourceOrigin(WTF::URL(makeString("builtin://node/", sourceURL.substring(5)))); // substring(5) drops the "node:" prefix
+        } else if (sourceURL.startsWith("bun:"_s)) {
+            return SourceOrigin(WTF::URL(makeString("builtin://bun/", sourceURL.substring(4)))); // substring(4) drops the "bun:" prefix
+        } else {
+            return SourceOrigin(WTF::URL(makeString("builtin://", sourceURL))); // any other builtin keeps its full name after the scheme
         }
     }
+
+    return SourceOrigin(WTF::URL::fileURLWithFileSystemPath(sourceURL)); // non-builtin sources are treated as file system paths
+}
+
+Ref<SourceProvider> SourceProvider::create(Zig::GlobalObject* globalObject, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType, bool isBuiltin)
+{
+    // Wraps resolvedSource in a newly allocated SourceProvider; isBuiltin only influences the SourceOrigin computed by toSourceOrigin().
     auto stringImpl = Bun::toWTFString(resolvedSource.source_code);
     auto sourceURLString = toStringCopy(resolvedSource.source_url);
 
-    if (stringImpl.impl()->refCount() > 1)
-        // Deref because we don't call a destructor for BunString
-        stringImpl.impl()->deref();
-
     auto provider = adoptRef(*new SourceProvider(
         globalObject->isThreadLocalDefaultGlobalObject ? globalObject : nullptr,
         resolvedSource, stringImpl.releaseImpl().releaseNonNull(),
-        JSC::SourceOrigin(WTF::URL::fileURLWithFileSystemPath(sourceURLString)),
+        toSourceOrigin(sourceURLString, isBuiltin), // builtin:// URL when isBuiltin, file URL otherwise
         sourceURLString.impl(), TextPosition(),
         sourceType));
 
-    if (providerKey) {
-        globalObject->sourceProviderMap.set(providerKey, provider.copyRef());
-    }
-
     return provider;
 }
 
@@ -90,11 +85,6 @@ unsigned SourceProvider::hash() const
 
 void SourceProvider::freeSourceCode()
 {
-    if (m_globalObjectForSourceProviderMap) {
-        m_globalObjectForSourceProviderMap->sourceProviderMap.remove((uintptr_t)m_source.get().characters8());
-    }
-
-    m_source = *WTF::StringImpl::empty();
 }
 
 void SourceProvider::updateCache(const UnlinkedFunctionExecutable* executable, const SourceCode&,
diff --git a/src/bun.js/bindings/ZigSourceProvider.h b/src/bun.js/bindings/ZigSourceProvider.h
index dd78b20ae..c189cc454 100644
--- a/src/bun.js/bindings/ZigSourceProvider.h
+++ b/src/bun.js/bindings/ZigSourceProvider.h
@@ -34,7 +34,7 @@ class SourceProvider final : public JSC::SourceProvider {
     using SourceOrigin = JSC::SourceOrigin;
 
 public:
-    static Ref<SourceProvider> create(Zig::GlobalObject*, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType = JSC::SourceProviderSourceType::Module);
+    static Ref<SourceProvider> create(Zig::GlobalObject*, ResolvedSource resolvedSource, JSC::SourceProviderSourceType sourceType = JSC::SourceProviderSourceType::Module, bool isBuiltin = false); // isBuiltin (spelled as in the .cpp definition) selects a builtin:// source origin instead of a file URL.
     ~SourceProvider()
     {
         freeSourceCode();
diff --git a/src/bun.js/bindings/bindings.cpp b/src/bun.js/bindings/bindings.cpp
index 4eee81f4d..d311072e4 100644
--- a/src/bun.js/bindings/bindings.cpp
+++ b/src/bun.js/bindings/bindings.cpp
@@ -679,8 +679,8 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
             return false;
         }
 
-        JSC::PropertyNameArray a1(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Include);
-        JSC::PropertyNameArray a2(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Include);
+        JSC::PropertyNameArray a1(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Exclude);
+        JSC::PropertyNameArray a2(vm, PropertyNameMode::Symbols, PrivateSymbolMode::Exclude);
         JSObject::getOwnPropertyNames(o1, globalObject, a1, DontEnumPropertiesMode::Exclude);
         JSObject::getOwnPropertyNames(o2, globalObject, a2, DontEnumPropertiesMode::Exclude);
 
@@ -753,7 +753,7 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
             }
 
             o1Structure->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
-                if (entry.attributes() & PropertyAttribute::DontEnum) {
+                if (entry.attributes() & PropertyAttribute::DontEnum || PropertyName(entry.key()).isPrivateName()) {
                     return true;
                 }
                 count1++;
@@ -787,7 +787,7 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
             if (result && o2Structure->id() != o1Structure->id()) {
                 size_t remain = count1;
                 o2Structure->forEachProperty(vm, [&](const PropertyTableEntry& entry) -> bool {
-                    if (entry.attributes() & PropertyAttribute::DontEnum) {
+                    if (entry.attributes() & PropertyAttribute::DontEnum || PropertyName(entry.key()).isPrivateName()) {
                         return true;
                     }
 
@@ -815,8 +815,8 @@ bool Bun__deepEquals(JSC__JSGlobalObject* globalObject, JSValue v1, JSValue v2,
         }
     }
 
-    JSC::PropertyNameArray a1(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Include);
-    JSC::PropertyNameArray a2(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Include);
+    JSC::PropertyNameArray a1(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Exclude);
+    JSC::PropertyNameArray a2(vm, PropertyNameMode::StringsAndSymbols, PrivateSymbolMode::Exclude);
     o1->getPropertyNames(globalObject, a1, DontEnumPropertiesMode::Exclude);
     o2->getPropertyNames(globalObject, a2, DontEnumPropertiesMode::Exclude);
 
@@ -1279,15 +1279,14 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
     JSC__JSGlobalObject* globalObject)
 {
 
-    static const char* system_error_name = "SystemError";
     SystemError err = *arg0;
 
     JSC::VM& vm = globalObject->vm();
 
     auto scope = DECLARE_THROW_SCOPE(vm);
     JSC::JSValue message = JSC::jsUndefined();
-    if (err.message.len > 0) {
-        message = Zig::toJSString(err.message, globalObject);
+    if (err.message.tag != BunStringTag::Empty) {
+        message = Bun::toJS(globalObject, err.message);
     }
 
     JSC::JSValue options = JSC::jsUndefined();
@@ -1297,8 +1296,8 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
 
     auto clientData = WebCore::clientData(vm);
 
-    if (err.code.len > 0 && !(err.code.len == 1 and err.code.ptr[0] == 0)) {
-        JSC::JSValue code = Zig::toJSStringGC(err.code, globalObject);
+    if (err.code.tag != BunStringTag::Empty) {
+        JSC::JSValue code = Bun::toJS(globalObject, err.code);
         result->putDirect(vm, clientData->builtinNames().codePublicName(), code,
             JSC::PropertyAttribute::DontDelete | 0);
 
@@ -1307,13 +1306,12 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
 
         result->putDirect(
             vm, vm.propertyNames->name,
-            JSC::JSValue(JSC::jsOwnedString(
-                vm, WTF::String(WTF::StringImpl::createWithoutCopying(system_error_name, 11)))),
+            JSC::JSValue(jsString(vm, String("SystemError"_s))),
             JSC::PropertyAttribute::DontEnum | 0);
     }
 
-    if (err.path.len > 0) {
-        JSC::JSValue path = JSC::JSValue(Zig::toJSStringGC(err.path, globalObject));
+    if (err.path.tag != BunStringTag::Empty) {
+        JSC::JSValue path = Bun::toJS(globalObject, err.path);
         result->putDirect(vm, clientData->builtinNames().pathPublicName(), path,
             JSC::PropertyAttribute::DontDelete | 0);
     }
@@ -1324,8 +1322,8 @@ JSC__JSValue SystemError__toErrorInstance(const SystemError* arg0,
             JSC::PropertyAttribute::DontDelete | 0);
     }
 
-    if (err.syscall.len > 0) {
-        JSC::JSValue syscall = JSC::JSValue(Zig::toJSString(err.syscall, globalObject));
+    if (err.syscall.tag != BunStringTag::Empty) {
+        JSC::JSValue syscall = Bun::toJS(globalObject, err.syscall);
         result->putDirect(vm, clientData->builtinNames().syscallPublicName(), syscall,
             JSC::PropertyAttribute::DontDelete | 0);
     }
@@ -2593,6 +2591,12 @@ bool JSC__JSPromise__isHandled(const JSC__JSPromise* arg0, JSC__VM* arg1)
 {
     return arg0->isHandled(reinterpret_cast<JSC::VM&>(arg1));
 }
+void JSC__JSPromise__setHandled(JSC__JSPromise* promise, JSC__VM* arg1)
+{
+    // Set isHandledFlag in the promise's Flags internal field, leaving the other flag bits as they are.
+    auto& flagsField = promise->internalField(JSC::JSPromise::Field::Flags);
+    flagsField.set(*arg1, promise, jsNumber(flagsField.get().asUInt32() | JSC::JSPromise::isHandledFlag));
+}
 
 #pragma mark - JSC::JSInternalPromise
 
@@ -2666,6 +2670,12 @@ bool JSC__JSInternalPromise__isHandled(const JSC__JSInternalPromise* arg0, JSC__
 {
     return arg0->isHandled(reinterpret_cast<JSC::VM&>(arg1));
 }
+void JSC__JSInternalPromise__setHandled(JSC__JSInternalPromise* promise, JSC__VM* arg1)
+{
+    // Set isHandledFlag in the internal promise's Flags internal field, leaving the other flag bits as they are.
+    auto& flagsField = promise->internalField(JSC::JSPromise::Field::Flags);
+    flagsField.set(*arg1, promise, jsNumber(flagsField.get().asUInt32() | JSC::JSPromise::isHandledFlag));
+}
 
 #pragma mark - JSC::JSGlobalObject
 
@@ -2765,8 +2775,18 @@ void JSC__JSValue__put(JSC__JSValue JSValue0, JSC__JSGlobalObject* arg1, const Z
 
 bool JSC__JSValue__isClass(JSC__JSValue JSValue0, JSC__JSGlobalObject* arg1)
 {
-    JSC::JSValue value = JSC::JSValue::decode(JSValue0);
-    return value.isConstructor();
+    JSValue value = JSValue::decode(JSValue0);
+    auto callData = getCallData(value); // the answer depends on which kind of call data the value has
+
+    switch (callData.type) {
+    case CallData::Type::JS: // JS function: a class only when its executable is a class constructor
+        return callData.js.functionExecutable->isClassConstructorFunction();
+    case CallData::Type::Native: // native function: bound functions are never reported as classes
+        if (callData.native.isBoundFunction)
+            return false;
+        return value.isConstructor();
+    }
+    return false; // any call-data type not handled by the switch is not a class
 }
 bool JSC__JSValue__isCell(JSC__JSValue JSValue0) { return JSC::JSValue::decode(JSValue0).isCell(); }
 bool JSC__JSValue__isCustomGetterSetter(JSC__JSValue JSValue0)
@@ -3291,7 +3311,8 @@ bool JSC__JSValue__stringIncludes(JSC__JSValue value, JSC__JSGlobalObject* globa
 
 static void populateStackFrameMetadata(JSC::VM& vm, const JSC::StackFrame* stackFrame, ZigStackFrame* frame)
 {
-    frame->source_url = Zig::toZigString(stackFrame->sourceURL(vm));
+
+    frame->source_url = Bun::toStringRef(stackFrame->sourceURL(vm));
 
     if (stackFrame->isWasmFrame()) {
         frame->code_type = ZigStackFrameCodeWasm;
@@ -3328,37 +3349,11 @@ static void populateStackFrameMetadata(JSC::VM& vm, const JSC::StackFrame* stack
 
     JSC::JSObject* callee = JSC::jsCast<JSC::JSObject*>(calleeCell);
 
-    // Does the code block have a user-defined name property?
-    JSC::JSValue name = callee->getDirect(vm, vm.propertyNames->name);
-    if (name && name.isString()) {
-        auto str = name.toWTFString(m_codeBlock->globalObject());
-        frame->function_name = Zig::toZigString(str);
-        return;
-    }
-
-    /* For functions (either JSFunction or InternalFunction), fallback to their "native" name
-     * property. Based on JSC::getCalculatedDisplayName, "inlining" the
-     * JSFunction::calculatedDisplayName\InternalFunction::calculatedDisplayName calls */
-    if (JSC::JSFunction* function = JSC::jsDynamicCast<JSC::JSFunction*>(callee)) {
-
-        WTF::String actualName = function->name(vm);
-        if (!actualName.isEmpty() || function->isHostOrBuiltinFunction()) {
-            frame->function_name = Zig::toZigString(actualName);
-            return;
-        }
-
-        auto inferred_name = function->jsExecutable()->name();
-        frame->function_name = Zig::toZigString(inferred_name.string());
-    }
-
-    if (JSC::InternalFunction* function = JSC::jsDynamicCast<JSC::InternalFunction*>(callee)) {
-        // Based on JSC::InternalFunction::calculatedDisplayName, skipping the "displayName" property
-        frame->function_name = Zig::toZigString(function->name());
-    }
+    frame->function_name = Bun::toStringRef(JSC::getCalculatedDisplayName(vm, callee));
 }
 // Based on
 // https://github.com/mceSystems/node-jsc/blob/master/deps/jscshim/src/shim/JSCStackTrace.cpp#L298
-static void populateStackFramePosition(const JSC::StackFrame* stackFrame, ZigString* source_lines,
+static void populateStackFramePosition(const JSC::StackFrame* stackFrame, BunString* source_lines,
     int32_t* source_line_numbers, uint8_t source_lines_count,
     ZigStackFramePosition* position)
 {
@@ -3428,7 +3423,7 @@ static void populateStackFramePosition(const JSC::StackFrame* stackFrame, ZigStr
 
         // Most of the time, when you look at a stack trace, you want a couple lines above
 
-        source_lines[0] = { &chars[lineStart], lineStop - lineStart };
+        source_lines[0] = Bun::toStringRef(sourceString.substring(lineStart, lineStop - lineStart).toStringWithoutCopying());
         source_line_numbers[0] = line;
 
         if (lineStart > 0) {
@@ -3445,8 +3440,7 @@ static void populateStackFramePosition(const JSC::StackFrame* stackFrame, ZigStr
                 }
 
                 // We are at the beginning of the line
-                source_lines[source_line_i] = { &chars[byte_offset_in_source_string],
-                    end_of_line_offset - byte_offset_in_source_string + 1 };
+                source_lines[source_line_i] = Bun::toStringRef(sourceString.substring(byte_offset_in_source_string, end_of_line_offset - byte_offset_in_source_string + 1).toStringWithoutCopying());
 
                 source_line_numbers[source_line_i] = line - source_line_i;
                 source_line_i++;
@@ -3516,12 +3510,13 @@ static void fromErrorInstance(ZigException* except, JSC::JSGlobalObject* global,
     JSC::JSValue val)
 {
     JSC::JSObject* obj = JSC::jsDynamicCast<JSC::JSObject*>(val);
+    JSC::VM& vm = global->vm();
 
     bool getFromSourceURL = false;
     if (stackTrace != nullptr && stackTrace->size() > 0) {
-        populateStackTrace(global->vm(), *stackTrace, &except->stack);
+        populateStackTrace(vm, *stackTrace, &except->stack);
     } else if (err->stackTrace() != nullptr && err->stackTrace()->size() > 0) {
-        populateStackTrace(global->vm(), *err->stackTrace(), &except->stack);
+        populateStackTrace(vm, *err->stackTrace(), &except->stack);
     } else {
         getFromSourceURL = true;
     }
@@ -3533,33 +3528,35 @@ static void fromErrorInstance(ZigException* except, JSC::JSGlobalObject* global,
         except->code = 8;
     }
     if (except->code == SYNTAX_ERROR_CODE) {
-        except->message = Zig::toZigString(err->sanitizedMessageString(global));
-    } else if (JSC::JSValue message = obj->getIfPropertyExists(global, global->vm().propertyNames->message)) {
+        except->message = Bun::toStringRef(err->sanitizedMessageString(global));
+    } else if (JSC::JSValue message = obj->getIfPropertyExists(global, vm.propertyNames->message)) {
 
-        except->message = Zig::toZigString(message, global);
+        except->message = Bun::toStringRef(global, message);
 
     } else {
-        except->message = Zig::toZigString(err->sanitizedMessageString(global));
+        except->message = Bun::toStringRef(err->sanitizedMessageString(global));
     }
-    except->name = Zig::toZigString(err->sanitizedNameString(global));
+
+    except->name = Bun::toStringRef(err->sanitizedNameString(global));
+
     except->runtime_type = err->runtimeTypeForCause();
 
-    auto clientData = WebCore::clientData(global->vm());
+    auto clientData = WebCore::clientData(vm);
     if (except->code != SYNTAX_ERROR_CODE) {
 
         if (JSC::JSValue syscall = obj->getIfPropertyExists(global, clientData->builtinNames().syscallPublicName())) {
-            except->syscall = Zig::toZigString(syscall, global);
+            except->syscall = Bun::toStringRef(global, syscall);
         }
 
         if (JSC::JSValue code = obj->getIfPropertyExists(global, clientData->builtinNames().codePublicName())) {
-            except->code_ = Zig::toZigString(code, global);
+            except->code_ = Bun::toStringRef(global, code);
         }
 
         if (JSC::JSValue path = obj->getIfPropertyExists(global, clientData->builtinNames().pathPublicName())) {
-            except->path = Zig::toZigString(path, global);
+            except->path = Bun::toStringRef(global, path);
         }
 
-        if (JSC::JSValue fd = obj->getIfPropertyExists(global, Identifier::fromString(global->vm(), "fd"_s))) {
+        if (JSC::JSValue fd = obj->getIfPropertyExists(global, Identifier::fromString(vm, "fd"_s))) {
             if (fd.isAnyInt()) {
                 except->fd = fd.toInt32(global);
             }
@@ -3571,27 +3568,29 @@ static void fromErrorInstance(ZigException* except, JSC::JSGlobalObject* global,
     }
 
     if (getFromSourceURL) {
-        if (JSC::JSValue sourceURL = obj->getIfPropertyExists(global, global->vm().propertyNames->sourceURL)) {
-            except->stack.frames_ptr[0].source_url = Zig::toZigString(sourceURL, global);
+        if (JSC::JSValue sourceURL = obj->getIfPropertyExists(global, vm.propertyNames->sourceURL)) {
+            except->stack.frames_ptr[0].source_url = Bun::toStringRef(global, sourceURL);
 
-            if (JSC::JSValue column = obj->getIfPropertyExists(global, global->vm().propertyNames->column)) {
+            if (JSC::JSValue column = obj->getIfPropertyExists(global, vm.propertyNames->column)) {
                 except->stack.frames_ptr[0].position.column_start = column.toInt32(global);
             }
 
-            if (JSC::JSValue line = obj->getIfPropertyExists(global, global->vm().propertyNames->line)) {
+            if (JSC::JSValue line = obj->getIfPropertyExists(global, vm.propertyNames->line)) {
                 except->stack.frames_ptr[0].position.line = line.toInt32(global);
 
-                if (JSC::JSValue lineText = obj->getIfPropertyExists(global, JSC::Identifier::fromString(global->vm(), "lineText"_s))) {
+                if (JSC::JSValue lineText = obj->getIfPropertyExists(global, JSC::Identifier::fromString(vm, "lineText"_s))) {
                     if (JSC::JSString* jsStr = lineText.toStringOrNull(global)) {
                         auto str = jsStr->value(global);
-                        except->stack.source_lines_ptr[0] = Zig::toZigString(str);
+                        except->stack.source_lines_ptr[0] = Bun::toStringRef(str);
                         except->stack.source_lines_numbers[0] = except->stack.frames_ptr[0].position.line;
                         except->stack.source_lines_len = 1;
                         except->remapped = true;
                     }
                 }
             }
+
             except->stack.frames_len = 1;
+            except->stack.frames_ptr[0].remapped = obj->hasProperty(global, JSC::Identifier::fromString(vm, "originalLine"_s));
         }
     }
 
@@ -3605,7 +3604,7 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
     if (JSC::JSObject* obj = JSC::jsDynamicCast<JSC::JSObject*>(value)) {
         if (obj->hasProperty(global, global->vm().propertyNames->name)) {
             auto name_str = obj->getIfPropertyExists(global, global->vm().propertyNames->name).toWTFString(global);
-            except->name = Zig::toZigString(name_str);
+            except->name = Bun::toStringRef(name_str);
             if (name_str == "Error"_s) {
                 except->code = JSErrorCodeError;
             } else if (name_str == "EvalError"_s) {
@@ -3627,14 +3626,14 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
 
         if (JSC::JSValue message = obj->getIfPropertyExists(global, global->vm().propertyNames->message)) {
             if (message) {
-                except->message = Zig::toZigString(
+                except->message = Bun::toStringRef(
                     message.toWTFString(global));
             }
         }
 
         if (JSC::JSValue sourceURL = obj->getIfPropertyExists(global, global->vm().propertyNames->sourceURL)) {
             if (sourceURL) {
-                except->stack.frames_ptr[0].source_url = Zig::toZigString(
+                except->stack.frames_ptr[0].source_url = Bun::toStringRef(
                     sourceURL.toWTFString(global));
                 except->stack.frames_len = 1;
             }
@@ -3642,7 +3641,12 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
 
         if (JSC::JSValue line = obj->getIfPropertyExists(global, global->vm().propertyNames->line)) {
             if (line) {
-                except->stack.frames_ptr[0].position.line = line.toInt32(global);
+                // TODO: don't sourcemap it twice
+                if (auto originalLine = obj->getIfPropertyExists(global, JSC::Identifier::fromString(global->vm(), "originalLine"_s))) {
+                    except->stack.frames_ptr[0].position.line = originalLine.toInt32(global);
+                } else {
+                    except->stack.frames_ptr[0].position.line = line.toInt32(global);
+                }
                 except->stack.frames_len = 1;
             }
         }
@@ -3658,9 +3662,7 @@ void exceptionFromString(ZigException* except, JSC::JSValue value, JSC::JSGlobal
     }
     scope.release();
 
-    auto ref = OpaqueJSString::tryCreate(str);
-    except->message = ZigString { ref->characters8(), ref->length() };
-    ref->ref();
+    except->message = Bun::toStringRef(str);
 }
 
 void JSC__VM__releaseWeakRefs(JSC__VM* arg0)
@@ -3770,8 +3772,8 @@ void JSC__JSValue__toZigException(JSC__JSValue JSValue0, JSC__JSGlobalObject* ar
     JSC::JSValue value = JSC::JSValue::decode(JSValue0);
     if (value == JSC::JSValue {}) {
         exception->code = JSErrorCodeError;
-        exception->name = Zig::toZigString("Error"_s);
-        exception->message = Zig::toZigString("Unknown error"_s);
+        exception->name = Bun::toStringRef("Error"_s);
+        exception->message = Bun::toStringRef("Unknown error"_s);
         return;
     }
 
@@ -3900,36 +3902,6 @@ void JSC__VM__throwError(JSC__VM* vm_, JSC__JSGlobalObject* arg1, JSC__JSValue v
     scope.throwException(arg1, exception);
 }
 
-#pragma mark - JSC::ThrowScope
-
-void JSC__ThrowScope__clearException(JSC__ThrowScope* arg0)
-{
-    arg0->clearException();
-};
-bJSC__ThrowScope JSC__ThrowScope__declare(JSC__VM* arg0, unsigned char* arg1, unsigned char* arg2,
-    size_t arg3)
-{
-    Wrap<JSC::ThrowScope, bJSC__ThrowScope> wrapped = Wrap<JSC::ThrowScope, bJSC__ThrowScope>();
-    wrapped.cpp = new (wrapped.alignedBuffer()) JSC::ThrowScope(reinterpret_cast<JSC::VM&>(arg0));
-    return wrapped.result;
-};
-JSC__Exception* JSC__ThrowScope__exception(JSC__ThrowScope* arg0) { return arg0->exception(); }
-void JSC__ThrowScope__release(JSC__ThrowScope* arg0) { arg0->release(); }
-
-#pragma mark - JSC::CatchScope
-
-void JSC__CatchScope__clearException(JSC__CatchScope* arg0)
-{
-    arg0->clearException();
-}
-bJSC__CatchScope JSC__CatchScope__declare(JSC__VM* arg0, unsigned char* arg1, unsigned char* arg2,
-    size_t arg3)
-{
-    JSC::CatchScope scope = JSC::CatchScope(reinterpret_cast<JSC::VM&>(arg0));
-    return cast<bJSC__CatchScope>(&scope);
-}
-JSC__Exception* JSC__CatchScope__exception(JSC__CatchScope* arg0) { return arg0->exception(); }
-
 JSC__JSValue JSC__JSPromise__rejectedPromiseValue(JSC__JSGlobalObject* arg0,
     JSC__JSValue JSValue1)
 {
@@ -4091,9 +4063,18 @@ restart:
             if (key.len == 0)
                 return true;
 
-            JSC::JSValue propertyValue = objectToUse == object ? objectToUse->getDirect(entry.offset()) : JSValue();
+            JSC::JSValue propertyValue = JSValue();
+
+            if (objectToUse == object) {
+                propertyValue = objectToUse->getDirect(entry.offset());
+                if (!propertyValue) {
+                    scope.clearException();
+                    return true;
+                }
+            }
+
             if (!propertyValue || propertyValue.isGetterSetter() && !((entry.attributes() & PropertyAttribute::Accessor) != 0)) {
-                propertyValue = objectToUse->get(globalObject, prop);
+                propertyValue = objectToUse->getIfPropertyExists(globalObject, prop);
             }
 
             if (scope.exception())
diff --git a/src/bun.js/bindings/bindings.zig b/src/bun.js/bindings/bindings.zig
index 35c9d26fa..7e3fa6d8e 100644
--- a/src/bun.js/bindings/bindings.zig
+++ b/src/bun.js/bindings/bindings.zig
@@ -291,7 +291,27 @@ pub const ZigString = extern struct {
         return this.len * 2;
     }
 
-    /// Count the number of code points in the string.
+    pub fn utf16ByteLength(this: ZigString) usize { // number of bytes this string occupies when encoded as UTF-16
+        if (this.isUTF8()) {
+            return bun.simdutf.length.utf16.from.utf8.le(this.slice()) * 2; // simdutf reports UTF-16 code units; two bytes each
+        }
+
+        if (this.is16Bit()) {
+            return this.len * 2; // already 16-bit: two bytes per element
+        }
+
+        return JSC.WebCore.Encoder.byteLengthU8(this.slice().ptr, this.slice().len, .utf16le); // 8-bit, non-UTF-8 input: delegate to the encoder
+    }
+
+    pub fn latin1ByteLength(this: ZigString) usize { // byte length of the string as Latin-1; not implemented for UTF-8 input
+        if (this.isUTF8()) {
+            @panic("TODO"); // UTF-8 strings are unsupported here and abort
+        }
+
+        return this.len; // every non-UTF-8 string reports its len field unchanged
+    }
+
+    /// Count the number of bytes in the UTF-8 version of the string.
     /// This function is slow. Use maxUITF8ByteLength() to get a quick estimate
     pub fn utf8ByteLength(this: ZigString) usize {
         if (this.isUTF8()) {
@@ -370,11 +390,11 @@ pub const ZigString = extern struct {
     }
 
     pub fn markStatic(this: *ZigString) void {
-        this.ptr = @intToPtr([*]const u8, @ptrToInt(this.ptr) | (1 << 60));
+        this.ptr = @ptrFromInt([*]const u8, @intFromPtr(this.ptr) | (1 << 60));
     }
 
     pub fn isStatic(this: *const ZigString) bool {
-        return @ptrToInt(this.ptr) & (1 << 60) != 0;
+        return @intFromPtr(this.ptr) & (1 << 60) != 0;
     }
 
     pub const Slice = struct {
@@ -483,7 +503,7 @@ pub const ZigString = extern struct {
         }
 
         pub fn mut(this: Slice) []u8 {
-            return @intToPtr([*]u8, @ptrToInt(this.ptr))[0..this.len];
+            return @ptrFromInt([*]u8, @intFromPtr(this.ptr))[0..this.len];
         }
 
         /// Does nothing if the slice is not allocated
@@ -504,7 +524,7 @@ pub const ZigString = extern struct {
     pub const namespace = "";
 
     pub inline fn is16Bit(this: *const ZigString) bool {
-        return (@ptrToInt(this._unsafe_ptr_do_not_use) & (1 << 63)) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 63)) != 0;
     }
 
     pub inline fn utf16Slice(this: *const ZigString) []align(1) const u16 {
@@ -539,7 +559,7 @@ pub const ZigString = extern struct {
     }
 
     pub fn sortDesc(slice_: []ZigString) void {
-        std.sort.sort(ZigString, slice_, {}, cmpDesc);
+        std.sort.block(ZigString, slice_, {}, cmpDesc);
     }
 
     pub fn cmpDesc(_: void, a: ZigString, b: ZigString) bool {
@@ -547,7 +567,7 @@ pub const ZigString = extern struct {
     }
 
     pub fn sortAsc(slice_: []ZigString) void {
-        std.sort.sort(ZigString, slice_, {}, cmpAsc);
+        std.sort.block(ZigString, slice_, {}, cmpAsc);
     }
 
     pub fn cmpAsc(_: void, a: ZigString, b: ZigString) bool {
@@ -641,15 +661,15 @@ pub const ZigString = extern struct {
     }
 
     pub fn isUTF8(this: ZigString) bool {
-        return (@ptrToInt(this._unsafe_ptr_do_not_use) & (1 << 61)) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 61)) != 0;
     }
 
     pub fn markUTF8(this: *ZigString) void {
-        this._unsafe_ptr_do_not_use = @intToPtr([*]const u8, @ptrToInt(this._unsafe_ptr_do_not_use) | (1 << 61));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 61));
     }
 
     pub fn markUTF16(this: *ZigString) void {
-        this._unsafe_ptr_do_not_use = @intToPtr([*]const u8, @ptrToInt(this._unsafe_ptr_do_not_use) | (1 << 63));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 63));
     }
 
     pub fn setOutputEncoding(this: *ZigString) void {
@@ -658,7 +678,7 @@ pub const ZigString = extern struct {
     }
 
     pub inline fn isGloballyAllocated(this: ZigString) bool {
-        return (@ptrToInt(this._unsafe_ptr_do_not_use) & (1 << 62)) != 0;
+        return (@intFromPtr(this._unsafe_ptr_do_not_use) & (1 << 62)) != 0;
     }
 
     pub inline fn deinitGlobal(this: ZigString) void {
@@ -668,7 +688,7 @@ pub const ZigString = extern struct {
     pub const mark = markGlobal;
 
     pub inline fn markGlobal(this: *ZigString) void {
-        this._unsafe_ptr_do_not_use = @intToPtr([*]const u8, @ptrToInt(this._unsafe_ptr_do_not_use) | (1 << 62));
+        this._unsafe_ptr_do_not_use = @ptrFromInt([*]const u8, @intFromPtr(this._unsafe_ptr_do_not_use) | (1 << 62));
     }
 
     pub fn format(self: ZigString, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
@@ -694,7 +714,7 @@ pub const ZigString = extern struct {
     inline fn untagged(ptr: [*]const u8) [*]const u8 {
         // this can be null ptr, so long as it's also a 0 length string
         @setRuntimeSafety(false);
-        return @intToPtr([*]const u8, @truncate(u53, @ptrToInt(ptr)));
+        return @ptrFromInt([*]const u8, @truncate(u53, @intFromPtr(ptr)));
     }
 
     pub fn slice(this: *const ZigString) []const u8 {
@@ -1303,7 +1323,7 @@ pub const FetchHeaders = opaque {
         this: *FetchHeaders,
         name_: HTTPHeaderName,
     ) bool {
-        return fastHas_(this, @enumToInt(name_));
+        return fastHas_(this, @intFromEnum(name_));
     }
 
     pub fn fastGet(
@@ -1311,7 +1331,7 @@ pub const FetchHeaders = opaque {
         name_: HTTPHeaderName,
     ) ?ZigString {
         var str = ZigString.init("");
-        fastGet_(this, @enumToInt(name_), &str);
+        fastGet_(this, @intFromEnum(name_), &str);
         if (str.len == 0) {
             return null;
         }
@@ -1441,7 +1461,7 @@ pub const FetchHeaders = opaque {
         this: *FetchHeaders,
         header: HTTPHeaderName,
     ) void {
-        return fastRemove_(this, @enumToInt(header));
+        return fastRemove_(this, @intFromEnum(header));
     }
 
     pub fn fastRemove_(
@@ -1561,10 +1581,10 @@ pub const FetchHeaders = opaque {
 pub const SystemError = extern struct {
     errno: c_int = 0,
     /// label for errno
-    code: ZigString = ZigString.init(""),
-    message: ZigString = ZigString.init(""),
-    path: ZigString = ZigString.init(""),
-    syscall: ZigString = ZigString.init(""),
+    code: String = String.empty,
+    message: String = String.empty,
+    path: String = String.empty,
+    syscall: String = String.empty,
     fd: i32 = -1,
 
     pub fn Maybe(comptime Result: type) type {
@@ -1611,11 +1631,11 @@ pub const Sizes = @import("../bindings/sizes.zig");
 pub const JSUint8Array = opaque {
     pub const name = "Uint8Array_alias";
     pub fn ptr(this: *JSUint8Array) [*]u8 {
-        return @intToPtr(*[*]u8, @ptrToInt(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayVector).*;
+        return @ptrFromInt(*[*]u8, @intFromPtr(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayVector).*;
     }
 
     pub fn len(this: *JSUint8Array) usize {
-        return @intToPtr(*usize, @ptrToInt(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayLength).*;
+        return @ptrFromInt(*usize, @intFromPtr(this) + Sizes.Bun_FFI_PointerOffsetToTypedArrayLength).*;
     }
 
     pub fn slice(this: *JSUint8Array) []u8 {
@@ -2045,6 +2065,9 @@ pub const JSPromise = extern struct {
     pub fn isHandled(this: *const JSPromise, vm: *VM) bool {
         return cppFn("isHandled", .{ this, vm });
     }
+    pub fn setHandled(this: *JSPromise, vm: *VM) void {
+        cppFn("setHandled", .{ this, vm }); // presumably dispatches to the C++ JSC__JSPromise__setHandled binding - confirm against cppFn
+    }
 
     pub fn rejectWithCaughtException(this: *JSPromise, globalObject: *JSGlobalObject, scope: ThrowScope) void {
         return cppFn("rejectWithCaughtException", .{ this, globalObject, scope });
@@ -2115,6 +2138,7 @@ pub const JSPromise = extern struct {
         "asValue",
         "create",
         "isHandled",
+        "setHandled",
         "reject",
         "rejectAsHandled",
         "rejectAsHandledException",
@@ -2149,6 +2173,9 @@ pub const JSInternalPromise = extern struct {
     pub fn isHandled(this: *const JSInternalPromise, vm: *VM) bool {
         return cppFn("isHandled", .{ this, vm });
     }
+    pub fn setHandled(this: *JSInternalPromise, vm: *VM) void {
+        cppFn("setHandled", .{ this, vm }); // presumably dispatches to the C++ JSC__JSInternalPromise__setHandled binding - confirm against cppFn
+    }
 
     pub fn rejectWithCaughtException(this: *JSInternalPromise, globalObject: *JSGlobalObject, scope: ThrowScope) void {
         return cppFn("rejectWithCaughtException", .{ this, globalObject, scope });
@@ -2332,6 +2359,7 @@ pub const JSInternalPromise = extern struct {
         "status",
         "result",
         "isHandled",
+        "setHandled",
         "resolvedPromise",
         "rejectedPromise",
         "resolve",
@@ -2363,6 +2391,11 @@ pub const AnyPromise = union(enum) {
             inline else => |promise| promise.isHandled(vm),
         };
     }
+    pub fn setHandled(this: AnyPromise, vm: *VM) void {
+        switch (this) {
+            inline else => |promise| promise.setHandled(vm), // every union variant forwards to its own setHandled
+        }
+    }
 
     pub fn rejectWithCaughtException(this: AnyPromise, globalObject: *JSGlobalObject, scope: ThrowScope) void {
         switch (this) {
@@ -2699,6 +2732,23 @@ pub const JSGlobalObject = extern struct {
             this.vm().throwError(this, this.createErrorInstance(Output.prettyFmt(fmt, false), args));
         }
     }
+    extern fn JSC__JSGlobalObject__queueMicrotaskCallback(*JSGlobalObject, *anyopaque, Function: *const (fn (*anyopaque) callconv(.C) void)) void;
+    pub fn queueMicrotaskCallback(
+        this: *JSGlobalObject,
+        ctx_val: anytype,
+        comptime Function: fn (ctx: @TypeOf(ctx_val)) void,
+    ) void {
+        JSC.markBinding(@src());
+        const Fn = Function;
+        const ContextType = @TypeOf(ctx_val);
+        const Wrapper = struct { // C-calling-convention trampoline generated per (ContextType, Function) pair
+            pub fn call(p: *anyopaque) callconv(.C) void {
+                Fn(bun.cast(ContextType, p)); // recover the typed context from the opaque pointer, then run the callback
+            }
+        };
+
+        JSC__JSGlobalObject__queueMicrotaskCallback(this, ctx_val, &Wrapper.call); // ctx_val is handed over as *anyopaque, so it has to be a pointer
+    }
 
     pub fn queueMicrotask(
         this: *JSGlobalObject,
@@ -3156,7 +3206,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
         pub fn isObject(this: JSType) bool {
             // inline constexpr bool isObjectType(JSType type) { return type >= ObjectType; }
-            return @enumToInt(this) >= @enumToInt(JSType.Object);
+            return @intFromEnum(this) >= @intFromEnum(JSType.Object);
         }
 
         pub fn isFunction(this: JSType) bool {
@@ -3311,7 +3361,7 @@ pub const JSValue = enum(JSValueReprInt) {
     };
 
     pub inline fn cast(ptr: anytype) JSValue {
-        return @intToEnum(JSValue, @bitCast(i64, @ptrToInt(ptr)));
+        return @enumFromInt(JSValue, @bitCast(i64, @intFromPtr(ptr)));
     }
 
     pub fn coerceToInt32(this: JSValue, globalThis: *JSC.JSGlobalObject) i32 {
@@ -3395,6 +3445,7 @@ pub const JSValue = enum(JSValueReprInt) {
             c_int => @intCast(c_int, toInt32(this)),
             ?AnyPromise => asAnyPromise(this),
             u52 => @truncate(u52, @intCast(u64, @max(this.toInt64(), 0))),
+            i52 => @truncate(i52, this.toInt64()), // truncate the i64 directly; an @intCast to i52 is illegal behavior when the value does not fit
             u64 => toUInt64NoTruncate(this),
             u8 => @truncate(u8, toU32(this)),
             i16 => @truncate(i16, toInt32(this)),
@@ -3808,7 +3859,7 @@ pub const JSValue = enum(JSValueReprInt) {
             return jsNumberFromInt32(@intCast(i32, i));
         }
 
-        return jsNumberFromDouble(@intToFloat(f64, @truncate(i52, i)));
+        return jsNumberFromDouble(@floatFromInt(f64, @truncate(i52, i)));
     }
 
     pub inline fn toJS(this: JSValue, _: *const JSGlobalObject) JSValue {
@@ -3820,7 +3871,7 @@ pub const JSValue = enum(JSValueReprInt) {
             return jsNumberFromInt32(@intCast(i32, i));
         }
 
-        return jsNumberFromDouble(@intToFloat(f64, @intCast(i52, @truncate(u51, i))));
+        return jsNumberFromDouble(@floatFromInt(f64, @intCast(i52, @truncate(u51, i))));
     }
 
     pub fn coerceDoubleTruncatingIntoInt64(this: JSValue) i64 {
@@ -3834,7 +3885,7 @@ pub const JSValue = enum(JSValueReprInt) {
             return if (double_value < 0) @as(i64, std.math.minInt(i64)) else @as(i64, std.math.maxInt(i64));
         }
 
-        return @floatToInt(
+        return @intFromFloat(
             i64,
             double_value,
         );
@@ -3871,26 +3922,26 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn isUndefined(this: JSValue) bool {
-        return @enumToInt(this) == 0xa;
+        return @intFromEnum(this) == 0xa;
     }
     pub inline fn isNull(this: JSValue) bool {
-        return @enumToInt(this) == 0x2;
+        return @intFromEnum(this) == 0x2;
     }
     pub inline fn isEmptyOrUndefinedOrNull(this: JSValue) bool {
-        return switch (@enumToInt(this)) {
+        return switch (@intFromEnum(this)) {
             0, 0xa, 0x2 => true,
             else => false,
         };
     }
     pub fn isUndefinedOrNull(this: JSValue) bool {
-        return switch (@enumToInt(this)) {
+        return switch (@intFromEnum(this)) {
             0xa, 0x2 => true,
             else => false,
         };
     }
     /// Empty as in "JSValue {}" rather than an empty string
     pub inline fn isEmpty(this: JSValue) bool {
-        return switch (@enumToInt(this)) {
+        return switch (@intFromEnum(this)) {
             0 => true,
             else => false,
         };
@@ -4016,7 +4067,7 @@ pub const JSValue = enum(JSValueReprInt) {
     pub inline fn isCell(this: JSValue) bool {
         return switch (this) {
             .zero, .undefined, .null, .true, .false => false,
-            else => (@bitCast(u64, @enumToInt(this)) & FFI.NotCellMask) == 0,
+            else => (@bitCast(u64, @intFromEnum(this)) & FFI.NotCellMask) == 0,
         };
     }
 
@@ -4179,7 +4230,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     // intended to be more lightweight than ZigString
     pub fn fastGet(this: JSValue, global: *JSGlobalObject, builtin_name: BuiltinName) ?JSValue {
-        const result = fastGet_(this, global, @enumToInt(builtin_name));
+        const result = fastGet_(this, global, @intFromEnum(builtin_name));
         if (result == .zero) {
             return null;
         }
@@ -4188,7 +4239,7 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub fn fastGetDirect(this: JSValue, global: *JSGlobalObject, builtin_name: BuiltinName) ?JSValue {
-        const result = fastGetDirect_(this, global, @enumToInt(builtin_name));
+        const result = fastGetDirect_(this, global, @intFromEnum(builtin_name));
         if (result == .zero) {
             return null;
         }
@@ -4243,7 +4294,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     pub fn get(this: JSValue, global: *JSGlobalObject, property: []const u8) ?JSValue {
         const value = getIfPropertyExistsImpl(this, global, property.ptr, @intCast(u32, property.len));
-        return if (@enumToInt(value) != 0) value else return null;
+        return if (@intFromEnum(value) != 0) value else return null;
     }
 
     pub fn implementsToString(this: JSValue, global: *JSGlobalObject) bool {
@@ -4407,7 +4458,7 @@ pub const JSValue = enum(JSValueReprInt) {
     /// This algorithm differs from the IsStrictlyEqual Algorithm by treating all NaN values as equivalent and by differentiating +0𝔽 from -0𝔽.
     /// https://tc39.es/ecma262/#sec-samevalue
     pub fn isSameValue(this: JSValue, other: JSValue, global: *JSGlobalObject) bool {
-        return @enumToInt(this) == @enumToInt(other) or cppFn("isSameValue", .{ this, other, global });
+        return @intFromEnum(this) == @intFromEnum(other) or cppFn("isSameValue", .{ this, other, global });
     }
 
     pub fn deepEquals(this: JSValue, other: JSValue, global: *JSGlobalObject) bool {
@@ -4460,7 +4511,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     /// Get the internal number of the `JSC::DateInstance` object
     /// Returns NaN if the value is not a `JSC::DateInstance` (`Date` in JS)
-     pub fn getUnixTimestamp(this: JSValue) f64 {
+    pub fn getUnixTimestamp(this: JSValue) f64 {
         return cppFn("getUnixTimestamp", .{
             this,
         });
@@ -4492,7 +4543,7 @@ pub const JSValue = enum(JSValueReprInt) {
 
     pub fn asNumber(this: JSValue) f64 {
         if (this.isInt32()) {
-            return @intToFloat(f64, this.asInt32());
+            return @floatFromInt(f64, this.asInt32());
         }
 
         if (isNumber(this)) {
@@ -4515,19 +4566,19 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub fn asPtr(this: JSValue, comptime Pointer: type) *Pointer {
-        return @intToPtr(*Pointer, this.asPtrAddress());
+        return @ptrFromInt(*Pointer, this.asPtrAddress());
     }
 
     pub fn fromPtrAddress(addr: anytype) JSValue {
-        return jsNumber(@intToFloat(f64, @bitCast(usize, @as(usize, addr))));
+        return jsNumber(@floatFromInt(f64, @bitCast(usize, @as(usize, addr))));
     }
 
     pub fn asPtrAddress(this: JSValue) usize {
-        return @bitCast(usize, @floatToInt(usize, this.asDouble()));
+        return @bitCast(usize, @intFromFloat(usize, this.asDouble()));
     }
 
     pub fn fromPtr(addr: anytype) JSValue {
-        return fromPtrAddress(@ptrToInt(addr));
+        return fromPtrAddress(@intFromPtr(addr));
     }
 
     pub fn toBooleanSlow(this: JSValue, global: *JSGlobalObject) bool {
@@ -4546,13 +4597,20 @@ pub const JSValue = enum(JSValueReprInt) {
         return FFI.JSVALUE_TO_BOOL(.{ .asJSValue = this });
     }
 
+    pub inline fn asInt52(this: JSValue) i64 { // Returns asDouble() converted to i64 after clamping to [minInt(i52), maxInt(i52)].
+        if (comptime bun.Environment.allow_assert) {
+            std.debug.assert(this.isNumber()); // checked only when allow_assert is enabled
+        }
+        return @intFromFloat(i64, @max(@min(this.asDouble(), std.math.maxInt(i52)), std.math.minInt(i52))); // clamp before converting so the result always fits the i52 range
+    }
+
     pub fn toInt32(this: JSValue) i32 {
         if (this.isInt32()) {
             return asInt32(this);
         }
 
         if (this.isNumber()) {
-            return @truncate(i32, @floatToInt(i64, asDouble(this)));
+            return @truncate(i32, this.asInt52());
         }
 
         if (comptime bun.Environment.allow_assert) {
@@ -4570,11 +4628,11 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn toU16(this: JSValue) u16 {
-        return @truncate(u16, this.toU32());
+        return @truncate(u16, @max(this.toInt32(), 0));
     }
 
     pub inline fn toU32(this: JSValue) u32 {
-        return @intCast(u32, @max(this.toInt32(), 0));
+        return @intCast(u32, @min(@max(this.toInt64(), 0), std.math.maxInt(u32)));
     }
 
     /// This function supports:
@@ -4591,11 +4649,11 @@ pub const JSValue = enum(JSValueReprInt) {
     /// If the "length" property does not exist, this function will return 0.
     pub fn getLength(this: JSValue, globalThis: *JSGlobalObject) u64 {
         const len = this.getLengthIfPropertyExistsInternal(globalThis);
-        if (len == std.math.f64_max) {
+        if (len == std.math.floatMax(f64)) {
             return 0;
         }
 
-        return @floatToInt(u64, @max(len, 0));
+        return @intFromFloat(u64, @max(@min(len, std.math.maxInt(i52)), 0));
     }
 
     /// This function supports:
@@ -4612,11 +4670,11 @@ pub const JSValue = enum(JSValueReprInt) {
     /// If the "length" property does not exist, this function will return null.
     pub fn tryGetLength(this: JSValue, globalThis: *JSGlobalObject) ?f64 {
         const len = this.getLengthIfPropertyExistsInternal(globalThis);
-        if (len == std.math.f64_max) {
+        if (len == std.math.floatMax(f64)) {
             return null;
         }
 
-        return @floatToInt(u64, @max(len, 0));
+        return @intFromFloat(u64, @max(@min(len, std.math.maxInt(i52)), 0));
     }
 
     /// Do not use this directly!
@@ -4661,15 +4719,15 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn asRef(this: JSValue) C_API.JSValueRef {
-        return @intToPtr(C_API.JSValueRef, @bitCast(usize, @enumToInt(this)));
+        return @ptrFromInt(C_API.JSValueRef, @bitCast(usize, @intFromEnum(this)));
     }
 
     pub inline fn c(this: C_API.JSValueRef) JSValue {
-        return @intToEnum(JSValue, @bitCast(JSValue.Type, @ptrToInt(this)));
+        return @enumFromInt(JSValue, @bitCast(JSValue.Type, @intFromPtr(this)));
     }
 
     pub inline fn fromRef(this: C_API.JSValueRef) JSValue {
-        return @intToEnum(JSValue, @bitCast(JSValue.Type, @ptrToInt(this)));
+        return @enumFromInt(JSValue, @bitCast(JSValue.Type, @intFromPtr(this)));
     }
 
     pub inline fn asObjectRef(this: JSValue) C_API.JSObjectRef {
@@ -4685,12 +4743,12 @@ pub const JSValue = enum(JSValueReprInt) {
     }
 
     pub inline fn asNullableVoid(this: JSValue) ?*anyopaque {
-        return @intToPtr(?*anyopaque, @bitCast(usize, @enumToInt(this)));
+        return @ptrFromInt(?*anyopaque, @bitCast(usize, @intFromEnum(this)));
     }
 
     pub inline fn asVoid(this: JSValue) *anyopaque {
         if (comptime bun.Environment.allow_assert) {
-            if (@enumToInt(this) == 0) {
+            if (@intFromEnum(this) == 0) {
                 @panic("JSValue is null");
             }
         }
@@ -4857,7 +4915,7 @@ pub const Exception = extern struct {
     pub fn create(globalObject: *JSGlobalObject, object: *JSObject, stack_capture: StackCaptureAction) *Exception {
         return cppFn(
             "create",
-            .{ globalObject, object, @enumToInt(stack_capture) },
+            .{ globalObject, object, @intFromEnum(stack_capture) },
         );
     }
 
@@ -4893,7 +4951,7 @@ pub const VM = extern struct {
         LargeHeap = 1,
     };
     pub fn create(heap_type: HeapType) *VM {
-        return cppFn("create", .{@enumToInt(heap_type)});
+        return cppFn("create", .{@intFromEnum(heap_type)});
     }
 
     pub fn deinit(vm: *VM, global_object: *JSGlobalObject) void {
@@ -5157,16 +5215,16 @@ pub const CallFrame = opaque {
     pub fn arguments(self: *const CallFrame, comptime max: usize) Arguments(max) {
         const len = self.argumentsCount();
         var ptr = self.argumentsPtr();
-        return switch (@min(len, max)) {
+        return switch (@as(u4, @min(len, max))) {
             0 => .{ .ptr = undefined, .len = 0 },
-            1 => Arguments(max).init(1, ptr),
-            2 => Arguments(max).init(@min(2, max), ptr),
-            3 => Arguments(max).init(@min(3, max), ptr),
-            4 => Arguments(max).init(@min(4, max), ptr),
-            5 => Arguments(max).init(@min(5, max), ptr),
-            6 => Arguments(max).init(@min(6, max), ptr),
-            7 => Arguments(max).init(@min(7, max), ptr),
-            8 => Arguments(max).init(@min(8, max), ptr),
+            4 => Arguments(max).init(comptime @min(4, max), ptr),
+            2 => Arguments(max).init(comptime @min(2, max), ptr),
+            6 => Arguments(max).init(comptime @min(6, max), ptr),
+            3 => Arguments(max).init(comptime @min(3, max), ptr),
+            8 => Arguments(max).init(comptime @min(8, max), ptr),
+            5 => Arguments(max).init(comptime @min(5, max), ptr),
+            1 => Arguments(max).init(comptime @min(1, max), ptr),
+            7 => Arguments(max).init(comptime @min(7, max), ptr),
             else => unreachable,
         };
     }
@@ -5555,6 +5613,7 @@ pub const __DOMCall__reader_u64 = @import("../api/bun.zig").FFI.Reader.Class.fun
 pub const __DOMCall__reader_intptr = @import("../api/bun.zig").FFI.Reader.Class.functionDefinitions.intptr;
 pub const __Crypto_getRandomValues = @import("../webcore.zig").Crypto.Class.functionDefinitions.getRandomValues;
 pub const __Crypto_randomUUID = @import("../webcore.zig").Crypto.Class.functionDefinitions.randomUUID;
+pub const __Crypto_randomInt = @import("../webcore.zig").Crypto.Class.functionDefinitions.randomInt;
 pub const __Crypto_timingSafeEqual = @import("../webcore.zig").Crypto.Class.functionDefinitions.timingSafeEqual;
 pub const DOMCalls = .{
     @import("../api/bun.zig").FFI,
diff --git a/src/bun.js/bindings/exports.zig b/src/bun.js/bindings/exports.zig
index 6ea1eba60..e9e9d3a8d 100644
--- a/src/bun.js/bindings/exports.zig
+++ b/src/bun.js/bindings/exports.zig
@@ -29,6 +29,7 @@ const Backtrace = @import("../../crash_reporter.zig");
 const JSPrinter = bun.js_printer;
 const JSLexer = bun.js_lexer;
 const typeBaseName = @import("../../meta.zig").typeBaseName;
+const String = bun.String;
 
 pub const ZigGlobalObject = extern struct {
     pub const shim = Shimmer("Zig", "GlobalObject", @This());
@@ -112,11 +113,11 @@ pub const ErrorCode = enum(ErrorCodeInt) {
     _,
 
     pub inline fn from(code: anyerror) ErrorCode {
-        return @intToEnum(ErrorCode, @errorToInt(code));
+        return @enumFromInt(ErrorCode, @intFromError(code));
     }
 
-    pub const ParserError = @enumToInt(ErrorCode.from(error.ParserError));
-    pub const JSErrorObject = @enumToInt(ErrorCode.from(error.JSErrorObject));
+    pub const ParserError = @intFromEnum(ErrorCode.from(error.ParserError));
+    pub const JSErrorObject = @intFromEnum(ErrorCode.from(error.JSErrorObject));
 
     pub const Type = ErrorCodeInt;
 };
@@ -216,9 +217,10 @@ pub const ResolvedSource = extern struct {
 
     pub const Tag = enum(u64) {
         javascript = 0,
-        wasm = 1,
-        object = 2,
-        file = 3,
+        package_json_type_module = 1,
+        wasm = 2,
+        object = 3,
+        file = 4,
 
         @"node:buffer" = 1024,
         @"node:process" = 1025,
@@ -244,7 +246,7 @@ export fn ZigString__free(raw: [*]const u8, len: usize, allocator_: ?*anyopaque)
 }
 
 export fn ZigString__free_global(ptr: [*]const u8, len: usize) void {
-    var untagged = @intToPtr(*anyopaque, @ptrToInt(ZigString.init(ptr[0..len]).slice().ptr));
+    var untagged = @ptrFromInt(*anyopaque, @intFromPtr(ZigString.init(ptr[0..len]).slice().ptr));
     if (comptime Environment.allow_assert) {
         std.debug.assert(Mimalloc.mi_is_in_heap_region(ptr));
     }
@@ -437,7 +439,7 @@ pub const Process = extern struct {
 };
 
 pub const ZigStackTrace = extern struct {
-    source_lines_ptr: [*c]ZigString,
+    source_lines_ptr: [*c]bun.String,
     source_lines_numbers: [*c]i32,
     source_lines_len: u8,
     source_lines_to_collect: u8,
@@ -455,23 +457,24 @@ pub const ZigStackTrace = extern struct {
         {
             var source_lines_iter = this.sourceLineIterator();
 
-            var source_line_len: usize = 0;
-            var count: usize = 0;
-            while (source_lines_iter.next()) |source| {
-                count += 1;
-                source_line_len += source.text.len;
-            }
+            var source_line_len = source_lines_iter.getLength();
 
-            if (count > 0 and source_line_len > 0) {
-                var source_lines = try allocator.alloc(Api.SourceLine, count);
+            if (source_line_len > 0) {
+                var source_lines = try allocator.alloc(Api.SourceLine, @intCast(usize, @max(source_lines_iter.i + 1, 0)));
                 var source_line_buf = try allocator.alloc(u8, source_line_len);
                 source_lines_iter = this.sourceLineIterator();
                 var remain_buf = source_line_buf[0..];
                 var i: usize = 0;
                 while (source_lines_iter.next()) |source| {
-                    bun.copy(u8, remain_buf, source.text);
-                    const copied_line = remain_buf[0..source.text.len];
-                    remain_buf = remain_buf[source.text.len..];
+                    const text = source.text.slice();
+                    defer source.text.deinit();
+                    bun.copy(
+                        u8,
+                        remain_buf,
+                        text,
+                    );
+                    const copied_line = remain_buf[0..text.len];
+                    remain_buf = remain_buf[text.len..];
                     source_lines[i] = .{ .text = copied_line, .line = source.line };
                     i += 1;
                 }
@@ -507,9 +510,18 @@ pub const ZigStackTrace = extern struct {
 
         pub const SourceLine = struct {
             line: i32,
-            text: string,
+            text: ZigString.Slice,
         };
 
+        pub fn getLength(this: *SourceLineIterator) usize { // Sums length() over the source lines at indices 0 through this.i.
+            var count: usize = 0;
+            for (this.trace.source_lines_ptr[0..@intCast(usize, this.i + 1)]) |*line| { // NOTE(review): the cast assumes this.i >= -1 — confirm against callers
+                count += line.length(); // NOTE(review): a caller sizes a u8 buffer from this sum and copies toUTF8() slices into it — confirm length() is a UTF-8 byte count
+            }
+
+            return count;
+        }
+
         pub fn untilLast(this: *SourceLineIterator) ?SourceLine {
             if (this.i < 1) return null;
             return this.next();
@@ -521,7 +533,7 @@ pub const ZigStackTrace = extern struct {
             const source_line = this.trace.source_lines_ptr[@intCast(usize, this.i)];
             const result = SourceLine{
                 .line = this.trace.source_lines_numbers[@intCast(usize, this.i)],
-                .text = source_line.slice(),
+                .text = source_line.toUTF8(bun.default_allocator),
             };
             this.i -= 1;
             return result;
@@ -540,28 +552,35 @@ pub const ZigStackTrace = extern struct {
 };
 
 pub const ZigStackFrame = extern struct {
-    function_name: ZigString,
-    source_url: ZigString,
+    function_name: String,
+    source_url: String,
     position: ZigStackFramePosition,
     code_type: ZigStackFrameCode,
 
     /// This informs formatters whether to display as a blob URL or not
     remapped: bool = false,
 
+    pub fn deinit(this: *ZigStackFrame) void { // Calls deref() on both String fields of this frame.
+        this.function_name.deref();
+        this.source_url.deref();
+    }
+
     pub fn toAPI(this: *const ZigStackFrame, root_path: string, origin: ?*const ZigURL, allocator: std.mem.Allocator) !Api.StackFrame {
         var frame: Api.StackFrame = comptime std.mem.zeroes(Api.StackFrame);
-        if (this.function_name.len > 0) {
-            frame.function_name = try allocator.dupe(u8, this.function_name.slice());
+        if (!this.function_name.isEmpty()) {
+            var slicer = this.function_name.toUTF8(allocator);
+            defer slicer.deinit();
+            frame.function_name = (try slicer.clone(allocator)).slice();
         }
 
-        if (this.source_url.len > 0) {
+        if (!this.source_url.isEmpty()) {
             frame.file = try std.fmt.allocPrint(allocator, "{any}", .{this.sourceURLFormatter(root_path, origin, true, false)});
         }
 
         frame.position.source_offset = this.position.source_offset;
 
         // For remapped code, we add 1 to the line number
-        frame.position.line = this.position.line + @as(i32, @boolToInt(this.remapped));
+        frame.position.line = this.position.line + @as(i32, @intFromBool(this.remapped));
 
         frame.position.line_start = this.position.line_start;
         frame.position.line_stop = this.position.line_stop;
@@ -569,13 +588,13 @@ pub const ZigStackFrame = extern struct {
         frame.position.column_stop = this.position.column_stop;
         frame.position.expression_start = this.position.expression_start;
         frame.position.expression_stop = this.position.expression_stop;
-        frame.scope = @intToEnum(Api.StackFrameScope, @enumToInt(this.code_type));
+        frame.scope = @enumFromInt(Api.StackFrameScope, @intFromEnum(this.code_type));
 
         return frame;
     }
 
     pub const SourceURLFormatter = struct {
-        source_url: ZigString,
+        source_url: bun.String,
         position: ZigStackFramePosition,
         enable_color: bool,
         origin: ?*const ZigURL,
@@ -587,7 +606,9 @@ pub const ZigStackFrame = extern struct {
                 try writer.writeAll(Output.prettyFmt("<r><cyan>", true));
             }
 
-            var source_slice = this.source_url.slice();
+            var source_slice_ = this.source_url.toUTF8(bun.default_allocator);
+            var source_slice = source_slice_.slice();
+            defer source_slice_.deinit();
 
             if (!this.remapped) {
                 if (this.origin) |origin| {
@@ -646,12 +667,12 @@ pub const ZigStackFrame = extern struct {
     };
 
     pub const NameFormatter = struct {
-        function_name: ZigString,
+        function_name: String,
         code_type: ZigStackFrameCode,
         enable_color: bool,
 
         pub fn format(this: NameFormatter, comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
-            const name = this.function_name.slice();
+            const name = this.function_name;
 
             switch (this.code_type) {
                 .Eval => {
@@ -661,26 +682,26 @@ pub const ZigStackFrame = extern struct {
                     // try writer.writeAll("(esm)");
                 },
                 .Function => {
-                    if (name.len > 0) {
+                    if (!name.isEmpty()) {
                         if (this.enable_color) {
-                            try std.fmt.format(writer, comptime Output.prettyFmt("<r><b><i>{s}<r>", true), .{name});
+                            try std.fmt.format(writer, comptime Output.prettyFmt("<r><b><i>{}<r>", true), .{name});
                         } else {
-                            try std.fmt.format(writer, "{s}", .{name});
+                            try std.fmt.format(writer, "{}", .{name});
                         }
                     }
                 },
                 .Global => {
-                    if (name.len > 0) {
-                        try std.fmt.format(writer, "globalThis {s}", .{name});
+                    if (!name.isEmpty()) {
+                        try std.fmt.format(writer, "globalThis {}", .{name});
                     } else {
                         try writer.writeAll("globalThis");
                     }
                 },
                 .Wasm => {
-                    try std.fmt.format(writer, "WASM {s}", .{name});
+                    try std.fmt.format(writer, "WASM {}", .{name});
                 },
                 .Constructor => {
-                    try std.fmt.format(writer, "new {s}", .{name});
+                    try std.fmt.format(writer, "new {}", .{name});
                 },
                 else => {},
             }
@@ -688,9 +709,9 @@ pub const ZigStackFrame = extern struct {
     };
 
     pub const Zero: ZigStackFrame = ZigStackFrame{
-        .function_name = ZigString{ ._unsafe_ptr_do_not_use = "", .len = 0 },
+        .function_name = String.empty,
         .code_type = ZigStackFrameCode.None,
-        .source_url = ZigString{ ._unsafe_ptr_do_not_use = "", .len = 0 },
+        .source_url = String.empty,
         .position = ZigStackFramePosition.Invalid,
     };
 
@@ -743,14 +764,14 @@ pub const ZigException = extern struct {
     /// SystemError only
     errno: c_int = 0,
     /// SystemError only
-    syscall: ZigString = ZigString.Empty,
+    syscall: String = String.empty,
     /// SystemError only
-    system_code: ZigString = ZigString.Empty,
+    system_code: String = String.empty,
     /// SystemError only
-    path: ZigString = ZigString.Empty,
+    path: String = String.empty,
 
-    name: ZigString,
-    message: ZigString,
+    name: String,
+    message: String,
     stack: ZigStackTrace,
 
     exception: ?*anyopaque,
@@ -759,6 +780,19 @@ pub const ZigException = extern struct {
 
     fd: i32 = -1,
 
+    pub fn deinit(this: *ZigException) void { // Calls deref() on every String field, then deinit() on each stack frame.
+        this.syscall.deref(); // the three SystemError-only strings
+        this.system_code.deref();
+        this.path.deref();
+
+        this.name.deref();
+        this.message.deref();
+
+        for (this.stack.frames_ptr[0..this.stack.frames_len]) |*frame| { // only the first frames_len entries are visited
+            frame.deinit();
+        }
+    }
+
     pub const shim = Shimmer("Zig", "Exception", @This());
     pub const name = "ZigException";
     pub const namespace = shim.namespace;
@@ -767,7 +801,7 @@ pub const ZigException = extern struct {
         const frame_count = 32;
         pub const source_lines_count = 6;
         source_line_numbers: [source_lines_count]i32,
-        source_lines: [source_lines_count]ZigString,
+        source_lines: [source_lines_count]String,
         frames: [frame_count]ZigStackFrame,
         loaded: bool,
         zig_exception: ZigException,
@@ -775,18 +809,18 @@ pub const ZigException = extern struct {
         pub const Zero: Holder = Holder{
             .frames = brk: {
                 var _frames: [frame_count]ZigStackFrame = undefined;
-                std.mem.set(ZigStackFrame, &_frames, ZigStackFrame.Zero);
+                @memset(&_frames, ZigStackFrame.Zero);
                 break :brk _frames;
             },
             .source_line_numbers = brk: {
                 var lines: [source_lines_count]i32 = undefined;
-                std.mem.set(i32, &lines, -1);
+                @memset(&lines, -1);
                 break :brk lines;
             },
 
             .source_lines = brk: {
-                var lines: [source_lines_count]ZigString = undefined;
-                std.mem.set(ZigString, &lines, ZigString.Empty);
+                var lines: [source_lines_count]String = undefined;
+                @memset(&lines, String.empty);
                 break :brk lines;
             },
             .zig_exception = undefined,
@@ -797,13 +831,17 @@ pub const ZigException = extern struct {
             return Holder.Zero;
         }
 
+        pub fn deinit(this: *Holder) void { // Calls deinit() on the exception returned by zigException().
+            this.zigException().deinit(); // goes through the accessor, which populates zig_exception first when `loaded` is false
+        }
+
         pub fn zigException(this: *Holder) *ZigException {
             if (!this.loaded) {
                 this.zig_exception = ZigException{
-                    .code = @intToEnum(JSErrorCode, 255),
+                    .code = @enumFromInt(JSErrorCode, 255),
                     .runtime_type = JSRuntimeType.Nothing,
-                    .name = ZigString.Empty,
-                    .message = ZigString.Empty,
+                    .name = String.empty,
+                    .message = String.empty,
                     .exception = null,
                     .stack = ZigStackTrace{
                         .source_lines_ptr = &this.source_lines,
@@ -831,13 +869,18 @@ pub const ZigException = extern struct {
         root_path: string,
         origin: ?*const ZigURL,
     ) !void {
-        const _name: string = @field(this, "name").slice();
-        const message: string = @field(this, "message").slice();
+        const name_slice = @field(this, "name").toUTF8(bun.default_allocator);
+        const message_slice = @field(this, "message").toUTF8(bun.default_allocator);
+
+        const _name = name_slice.slice();
+        defer name_slice.deinit();
+        const message = message_slice.slice();
+        defer message_slice.deinit();
 
         var is_empty = true;
         var api_exception = Api.JsException{
-            .runtime_type = @enumToInt(this.runtime_type),
-            .code = @enumToInt(this.code),
+            .runtime_type = @intFromEnum(this.runtime_type),
+            .code = @intFromEnum(this.code),
         };
 
         if (_name.len > 0) {
@@ -1314,7 +1357,7 @@ pub const ZigConsoleClient = struct {
             };
 
             pub fn getAdvanced(value: JSValue, globalThis: *JSGlobalObject, opts: Options) Result {
-                switch (@enumToInt(value)) {
+                switch (@intFromEnum(value)) {
                     0, 0xa => return Result{
                         .tag = .Undefined,
                     },
@@ -1374,23 +1417,20 @@ pub const ZigConsoleClient = struct {
 
                 // If we check an Object has a method table and it does not
                 // it will crash
-                const callable = js_type != .Object and value.isCallable(globalThis.vm());
-
-                if (value.isClass(globalThis) and !callable) {
-                    return .{
-                        .tag = .Object,
-                        .cell = js_type,
-                    };
-                }
+                if (js_type != .Object and value.isCallable(globalThis.vm())) {
+                    if (value.isClass(globalThis)) {
+                        return .{
+                            .tag = .Class,
+                            .cell = js_type,
+                        };
+                    }
 
-                if (callable and js_type == .JSFunction) {
-                    return .{
-                        .tag = .Function,
-                        .cell = js_type,
-                    };
-                } else if (callable and js_type == .InternalFunction) {
                     return .{
-                        .tag = .Object,
+                        // TODO: we print InternalFunction as Object because we have a lot of
+                        // callable namespaces, and printing their contents is better than [Function: namespace].
+                        // Ideally, we would print [Function: namespace] { ... } for all functions, internal and JS.
+                        // Later, we'll get rid of .Function and .Class and handle the prefix in the .Object formatter.
+                        .tag = if (js_type == .InternalFunction) .Object else .Function,
                         .cell = js_type,
                     };
                 }
@@ -1713,7 +1753,7 @@ pub const ZigConsoleClient = struct {
                 parent: JSValue,
                 const enable_ansi_colors = enable_ansi_colors_;
                 pub fn handleFirstProperty(this: *@This(), globalThis: *JSC.JSGlobalObject, value: JSValue) void {
-                    if (!value.jsType().isFunction() and !value.isClass(globalThis)) {
+                    if (!value.jsType().isFunction()) {
                         var writer = WrappedWriter(Writer){
                             .ctx = this.writer,
                             .failed = false,
@@ -1878,7 +1918,7 @@ pub const ZigConsoleClient = struct {
                     this.map = this.map_node.?.data;
                 }
 
-                var entry = this.map.getOrPut(@enumToInt(value)) catch unreachable;
+                var entry = this.map.getOrPut(@intFromEnum(value)) catch unreachable;
                 if (entry.found_existing) {
                     writer.writeAll(comptime Output.prettyFmt("<r><cyan>[Circular]<r>", enable_ansi_colors));
                     return;
@@ -1887,7 +1927,7 @@ pub const ZigConsoleClient = struct {
 
             defer {
                 if (comptime Format.canHaveCircularReferences()) {
-                    _ = this.map.remove(@enumToInt(value));
+                    _ = this.map.remove(@intFromEnum(value));
                 }
             }
 
@@ -1959,7 +1999,7 @@ pub const ZigConsoleClient = struct {
                             i = -i;
                         }
                         const digits = if (i != 0)
-                            bun.fmt.fastDigitCount(@intCast(usize, i)) + @as(usize, @boolToInt(is_negative))
+                            bun.fmt.fastDigitCount(@intCast(usize, i)) + @as(usize, @intFromBool(is_negative))
                         else
                             1;
                         this.addForNewLine(digits);
@@ -2051,9 +2091,9 @@ pub const ZigConsoleClient = struct {
                     this.addForNewLine(printable.len);
 
                     if (printable.len == 0) {
-                        writer.print(comptime Output.prettyFmt("[class]", enable_ansi_colors), .{});
+                        writer.print(comptime Output.prettyFmt("<cyan>[class]<r>", enable_ansi_colors), .{});
                     } else {
-                        writer.print(comptime Output.prettyFmt("[class <cyan>{}<r>]", enable_ansi_colors), .{printable});
+                        writer.print(comptime Output.prettyFmt("<cyan>[class {}]<r>", enable_ansi_colors), .{printable});
                     }
                 },
                 .Function => {
@@ -2063,7 +2103,7 @@ pub const ZigConsoleClient = struct {
                     if (printable.len == 0) {
                         writer.print(comptime Output.prettyFmt("<cyan>[Function]<r>", enable_ansi_colors), .{});
                     } else {
-                        writer.print(comptime Output.prettyFmt("<cyan>[Function<d>:<r> <cyan>{}]<r>", enable_ansi_colors), .{printable});
+                        writer.print(comptime Output.prettyFmt("<cyan>[Function: {}]<r>", enable_ansi_colors), .{printable});
                     }
                 },
                 .Getter => {
@@ -2220,11 +2260,11 @@ pub const ZigConsoleClient = struct {
                     } else if (value.as(JSC.ResolveMessage)) |resolve_log| {
                         resolve_log.msg.writeFormat(writer_, enable_ansi_colors) catch {};
                         return;
-                    } else if (value.as(JSC.Jest.ExpectAnything) != null) {
+                    } else if (value.as(JSC.Expect.ExpectAnything) != null) {
                         writer.writeAll("Anything");
                         return;
-                    } else if (value.as(JSC.Jest.ExpectAny) != null) {
-                        const constructor_value = JSC.Jest.ExpectAny.constructorValueGetCached(value) orelse return;
+                    } else if (value.as(JSC.Expect.ExpectAny) != null) {
+                        const constructor_value = JSC.Expect.ExpectAny.constructorValueGetCached(value) orelse return;
 
                         this.addForNewLine("Any<".len);
                         writer.writeAll("Any<");
@@ -2237,16 +2277,16 @@ pub const ZigConsoleClient = struct {
                         writer.writeAll(">");
 
                         return;
-                    } else if (value.as(JSC.Jest.ExpectStringContaining) != null) {
-                        const substring_value = JSC.Jest.ExpectStringContaining.stringValueGetCached(value) orelse return;
+                    } else if (value.as(JSC.Expect.ExpectStringContaining) != null) {
+                        const substring_value = JSC.Expect.ExpectStringContaining.stringValueGetCached(value) orelse return;
 
                         this.addForNewLine("StringContaining ".len);
                         writer.writeAll("StringContaining ");
                         this.printAs(.String, Writer, writer_, substring_value, .String, enable_ansi_colors);
 
                         return;
-                    } else if (value.as(JSC.Jest.ExpectStringMatching) != null) {
-                        const test_value = JSC.Jest.ExpectStringMatching.testValueGetCached(value) orelse return;
+                    } else if (value.as(JSC.Expect.ExpectStringMatching) != null) {
+                        const test_value = JSC.Expect.ExpectStringMatching.testValueGetCached(value) orelse return;
 
                         this.addForNewLine("StringMatching ".len);
                         writer.writeAll("StringMatching ");
@@ -2559,7 +2599,7 @@ pub const ZigConsoleClient = struct {
                             {
                                 this.indent += 1;
                                 defer this.indent -|= 1;
-                                const count_without_children = props_iter.len - @as(usize, @boolToInt(children_prop != null));
+                                const count_without_children = props_iter.len - @as(usize, @intFromBool(children_prop != null));
 
                                 while (props_iter.next()) |prop| {
                                     if (prop.eqlComptime("children"))
@@ -2759,7 +2799,7 @@ pub const ZigConsoleClient = struct {
                     }
 
                     if (iter.i == 0) {
-                        if (value.isClass(this.globalThis) and !value.isCallable(this.globalThis.vm()))
+                        if (value.isClass(this.globalThis))
                             this.printAs(.Class, Writer, writer_, value, jsType, enable_ansi_colors)
                         else if (value.isCallable(this.globalThis.vm()))
                             this.printAs(.Function, Writer, writer_, value, jsType, enable_ansi_colors)
@@ -3002,7 +3042,7 @@ pub const ZigConsoleClient = struct {
         chars: [*]const u8,
         len: usize,
     ) callconv(.C) void {
-        const id = std.hash.Wyhash.hash(0, chars[0..len]);
+        const id = bun.hash(chars[0..len]);
         if (!pending_time_logs_loaded) {
             pending_time_logs = PendingTimers.init(default_allocator);
             pending_time_logs_loaded = true;
@@ -3026,12 +3066,12 @@ pub const ZigConsoleClient = struct {
             return;
         }
 
-        const id = std.hash.Wyhash.hash(0, chars[0..len]);
+        const id = bun.hash(chars[0..len]);
         var result = (pending_time_logs.fetchPut(id, null) catch null) orelse return;
         var value: std.time.Timer = result.value orelse return;
         // get the duration in microseconds
         // then display it in milliseconds
-        Output.printElapsed(@intToFloat(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
+        Output.printElapsed(@floatFromInt(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
         switch (len) {
             0 => Output.printErrorln("\n", .{}),
             else => Output.printErrorln(" {s}", .{chars[0..len]}),
@@ -3056,11 +3096,11 @@ pub const ZigConsoleClient = struct {
             return;
         }
 
-        const id = std.hash.Wyhash.hash(0, chars[0..len]);
+        const id = bun.hash(chars[0..len]);
         var value: std.time.Timer = (pending_time_logs.get(id) orelse return) orelse return;
         // get the duration in microseconds
         // then display it in milliseconds
-        Output.printElapsed(@intToFloat(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
+        Output.printElapsed(@floatFromInt(f64, value.read() / std.time.ns_per_us) / std.time.us_per_ms);
         switch (len) {
             0 => Output.printErrorln("\n", .{}),
             else => Output.printErrorln(" {s}", .{chars[0..len]}),
diff --git a/src/bun.js/bindings/generated_classes.zig b/src/bun.js/bindings/generated_classes.zig
index 0ec65a469..171bba792 100644
--- a/src/bun.js/bindings/generated_classes.zig
+++ b/src/bun.js/bindings/generated_classes.zig
@@ -108,6 +108,8 @@ pub const JSBlob = struct {
 
         if (@TypeOf(Blob.getArrayBuffer) != CallbackType)
             @compileLog("Expected Blob.getArrayBuffer to be a callback but received " ++ @typeName(@TypeOf(Blob.getArrayBuffer)));
+        if (@TypeOf(Blob.getExists) != CallbackType)
+            @compileLog("Expected Blob.getExists to be a callback but received " ++ @typeName(@TypeOf(Blob.getExists)));
         if (@TypeOf(Blob.getFormData) != CallbackType)
             @compileLog("Expected Blob.getFormData to be a callback but received " ++ @typeName(@TypeOf(Blob.getFormData)));
         if (@TypeOf(Blob.getJSON) != CallbackType)
@@ -136,6 +138,7 @@ pub const JSBlob = struct {
             @export(Blob.constructor, .{ .name = "BlobClass__construct" });
             @export(Blob.finalize, .{ .name = "BlobClass__finalize" });
             @export(Blob.getArrayBuffer, .{ .name = "BlobPrototype__getArrayBuffer" });
+            @export(Blob.getExists, .{ .name = "BlobPrototype__getExists" });
             @export(Blob.getFormData, .{ .name = "BlobPrototype__getFormData" });
             @export(Blob.getJSON, .{ .name = "BlobPrototype__getJSON" });
             @export(Blob.getLastModified, .{ .name = "BlobPrototype__getLastModified" });
@@ -1406,6 +1409,97 @@ pub const JSExpectStringMatching = struct {
         }
     }
 };
+pub const JSFSWatcher = struct { // Zig-side glue for the FSWatcher class: extern hooks, cached `listener` accessors, comptime checks and exports.
+    const FSWatcher = Classes.FSWatcher;
+    const GetterType = fn (*FSWatcher, *JSC.JSGlobalObject) callconv(.C) JSC.JSValue; // not referenced within this struct
+    const GetterTypeWithThisValue = fn (*FSWatcher, JSC.JSValue, *JSC.JSGlobalObject) callconv(.C) JSC.JSValue; // not referenced within this struct
+    const SetterType = fn (*FSWatcher, *JSC.JSGlobalObject, JSC.JSValue) callconv(.C) bool; // not referenced within this struct
+    const SetterTypeWithThisValue = fn (*FSWatcher, JSC.JSValue, *JSC.JSGlobalObject, JSC.JSValue) callconv(.C) bool; // not referenced within this struct
+    const CallbackType = fn (*FSWatcher, *JSC.JSGlobalObject, *JSC.CallFrame) callconv(.C) JSC.JSValue; // signature the comptime block requires of doClose/hasRef/doRef/doUnref
+
+    /// Return the pointer to the wrapped object.
+    /// If the object does not match the type, return null.
+    pub fn fromJS(value: JSC.JSValue) ?*FSWatcher {
+        JSC.markBinding(@src());
+        return FSWatcher__fromJS(value);
+    }
+
+    extern fn FSWatcherPrototype__listenerSetCachedValue(JSC.JSValue, *JSC.JSGlobalObject, JSC.JSValue) void; // defined outside this file — presumably the generated C++ bindings
+
+    extern fn FSWatcherPrototype__listenerGetCachedValue(JSC.JSValue) JSC.JSValue; // defined outside this file — presumably the generated C++ bindings
+
+    /// `FSWatcher.listener` setter: forwards `value` to the extern cached-value setter for `thisValue`.
+    /// This value will be visited by the garbage collector.
+    pub fn listenerSetCached(thisValue: JSC.JSValue, globalObject: *JSC.JSGlobalObject, value: JSC.JSValue) void {
+        JSC.markBinding(@src());
+        FSWatcherPrototype__listenerSetCachedValue(thisValue, globalObject, value);
+    }
+
+    /// `FSWatcher.listener` getter: returns null when the extern getter yields `.zero`.
+    /// This value will be visited by the garbage collector.
+    pub fn listenerGetCached(thisValue: JSC.JSValue) ?JSC.JSValue {
+        JSC.markBinding(@src());
+        const result = FSWatcherPrototype__listenerGetCachedValue(thisValue);
+        if (result == .zero)
+            return null;
+
+        return result;
+    }
+
+    /// Create the JS value for `this` via `FSWatcher__create`; assert-enabled builds also check it maps back to `this`.
+    pub fn toJS(this: *FSWatcher, globalObject: *JSC.JSGlobalObject) JSC.JSValue {
+        JSC.markBinding(@src());
+        if (comptime Environment.allow_assert) {
+            const value__ = FSWatcher__create(globalObject, this);
+            std.debug.assert(value__.as(FSWatcher).? == this); // If this fails, likely a C ABI issue.
+            return value__;
+        } else {
+            return FSWatcher__create(globalObject, this);
+        }
+    }
+
+    /// Modify the internal ptr to point to a new instance of FSWatcher; returns the bool reported by the extern call.
+    pub fn dangerouslySetPtr(value: JSC.JSValue, ptr: ?*FSWatcher) bool {
+        JSC.markBinding(@src());
+        return FSWatcher__dangerouslySetPtr(value, ptr);
+    }
+
+    /// Detach the ptr from the thisValue by setting the internal pointer to null.
+    pub fn detachPtr(_: *FSWatcher, value: JSC.JSValue) void {
+        JSC.markBinding(@src());
+        std.debug.assert(FSWatcher__dangerouslySetPtr(value, null)); // the detach call itself is the assert's argument
+    }
+
+    extern fn FSWatcher__fromJS(JSC.JSValue) ?*FSWatcher;
+    extern fn FSWatcher__getConstructor(*JSC.JSGlobalObject) JSC.JSValue; // declared but not called within this struct
+
+    extern fn FSWatcher__create(globalObject: *JSC.JSGlobalObject, ptr: ?*FSWatcher) JSC.JSValue;
+
+    extern fn FSWatcher__dangerouslySetPtr(JSC.JSValue, ?*FSWatcher) bool;
+
+    comptime { // compile-time signature checks on the Zig implementation, followed by the symbol exports
+        if (@TypeOf(FSWatcher.finalize) != (fn (*FSWatcher) callconv(.C) void)) {
+            @compileLog("FSWatcher.finalize is not a finalizer");
+        }
+
+        if (@TypeOf(FSWatcher.doClose) != CallbackType)
+            @compileLog("Expected FSWatcher.doClose to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.doClose)));
+        if (@TypeOf(FSWatcher.hasRef) != CallbackType)
+            @compileLog("Expected FSWatcher.hasRef to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.hasRef)));
+        if (@TypeOf(FSWatcher.doRef) != CallbackType)
+            @compileLog("Expected FSWatcher.doRef to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.doRef)));
+        if (@TypeOf(FSWatcher.doUnref) != CallbackType)
+            @compileLog("Expected FSWatcher.doUnref to be a callback but received " ++ @typeName(@TypeOf(FSWatcher.doUnref)));
+        if (!JSC.is_bindgen) { // exports are skipped when JSC.is_bindgen is set
+            @export(FSWatcher.doClose, .{ .name = "FSWatcherPrototype__doClose" });
+            @export(FSWatcher.doRef, .{ .name = "FSWatcherPrototype__doRef" });
+            @export(FSWatcher.doUnref, .{ .name = "FSWatcherPrototype__doUnref" });
+            @export(FSWatcher.finalize, .{ .name = "FSWatcherClass__finalize" });
+            @export(FSWatcher.hasPendingActivity, .{ .name = "FSWatcher__hasPendingActivity" }); // NOTE(review): exported without a signature check above
+            @export(FSWatcher.hasRef, .{ .name = "FSWatcherPrototype__hasRef" });
+        }
+    }
+};
 pub const JSFileSystemRouter = struct {
     const FileSystemRouter = Classes.FileSystemRouter;
     const GetterType = fn (*FileSystemRouter, *JSC.JSGlobalObject) callconv(.C) JSC.JSValue;
@@ -2312,6 +2406,8 @@ pub const JSNodeJSFS = struct {
             @compileLog("Expected NodeJSFS.utimes to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.utimes)));
         if (@TypeOf(NodeJSFS.utimesSync) != CallbackType)
             @compileLog("Expected NodeJSFS.utimesSync to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.utimesSync)));
+        if (@TypeOf(NodeJSFS.watch) != CallbackType)
+            @compileLog("Expected NodeJSFS.watch to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.watch)));
         if (@TypeOf(NodeJSFS.write) != CallbackType)
             @compileLog("Expected NodeJSFS.write to be a callback but received " ++ @typeName(@TypeOf(NodeJSFS.write)));
         if (@TypeOf(NodeJSFS.writeFile) != CallbackType)
@@ -2402,6 +2498,7 @@ pub const JSNodeJSFS = struct {
             @export(NodeJSFS.unlinkSync, .{ .name = "NodeJSFSPrototype__unlinkSync" });
             @export(NodeJSFS.utimes, .{ .name = "NodeJSFSPrototype__utimes" });
             @export(NodeJSFS.utimesSync, .{ .name = "NodeJSFSPrototype__utimesSync" });
+            @export(NodeJSFS.watch, .{ .name = "NodeJSFSPrototype__watch" });
             @export(NodeJSFS.write, .{ .name = "NodeJSFSPrototype__write" });
             @export(NodeJSFS.writeFile, .{ .name = "NodeJSFSPrototype__writeFile" });
             @export(NodeJSFS.writeFileSync, .{ .name = "NodeJSFSPrototype__writeFileSync" });
@@ -4329,6 +4426,9 @@ pub const JSTCPSocket = struct {
             @compileLog("TCPSocket.finalize is not a finalizer");
         }
 
+        if (@TypeOf(TCPSocket.getALPNProtocol) != GetterType)
+            @compileLog("Expected TCPSocket.getALPNProtocol to be a getter");
+
         if (@TypeOf(TCPSocket.getAuthorized) != GetterType)
             @compileLog("Expected TCPSocket.getAuthorized to be a getter");
 
@@ -4359,18 +4459,23 @@ pub const JSTCPSocket = struct {
         if (@TypeOf(TCPSocket.getRemoteAddress) != GetterType)
             @compileLog("Expected TCPSocket.getRemoteAddress to be a getter");
 
+        if (@TypeOf(TCPSocket.setServername) != CallbackType)
+            @compileLog("Expected TCPSocket.setServername to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.setServername)));
         if (@TypeOf(TCPSocket.shutdown) != CallbackType)
             @compileLog("Expected TCPSocket.shutdown to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.shutdown)));
         if (@TypeOf(TCPSocket.timeout) != CallbackType)
             @compileLog("Expected TCPSocket.timeout to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.timeout)));
         if (@TypeOf(TCPSocket.unref) != CallbackType)
             @compileLog("Expected TCPSocket.unref to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.unref)));
+        if (@TypeOf(TCPSocket.upgradeTLS) != CallbackType)
+            @compileLog("Expected TCPSocket.upgradeTLS to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.upgradeTLS)));
         if (@TypeOf(TCPSocket.write) != CallbackType)
             @compileLog("Expected TCPSocket.write to be a callback but received " ++ @typeName(@TypeOf(TCPSocket.write)));
         if (!JSC.is_bindgen) {
             @export(TCPSocket.end, .{ .name = "TCPSocketPrototype__end" });
             @export(TCPSocket.finalize, .{ .name = "TCPSocketClass__finalize" });
             @export(TCPSocket.flush, .{ .name = "TCPSocketPrototype__flush" });
+            @export(TCPSocket.getALPNProtocol, .{ .name = "TCPSocketPrototype__getALPNProtocol" });
             @export(TCPSocket.getAuthorizationError, .{ .name = "TCPSocketPrototype__getAuthorizationError" });
             @export(TCPSocket.getAuthorized, .{ .name = "TCPSocketPrototype__getAuthorized" });
             @export(TCPSocket.getData, .{ .name = "TCPSocketPrototype__getData" });
@@ -4382,9 +4487,11 @@ pub const JSTCPSocket = struct {
             @export(TCPSocket.ref, .{ .name = "TCPSocketPrototype__ref" });
             @export(TCPSocket.reload, .{ .name = "TCPSocketPrototype__reload" });
             @export(TCPSocket.setData, .{ .name = "TCPSocketPrototype__setData" });
+            @export(TCPSocket.setServername, .{ .name = "TCPSocketPrototype__setServername" });
             @export(TCPSocket.shutdown, .{ .name = "TCPSocketPrototype__shutdown" });
             @export(TCPSocket.timeout, .{ .name = "TCPSocketPrototype__timeout" });
             @export(TCPSocket.unref, .{ .name = "TCPSocketPrototype__unref" });
+            @export(TCPSocket.upgradeTLS, .{ .name = "TCPSocketPrototype__upgradeTLS" });
             @export(TCPSocket.write, .{ .name = "TCPSocketPrototype__write" });
         }
     }
@@ -4484,6 +4591,9 @@ pub const JSTLSSocket = struct {
             @compileLog("TLSSocket.finalize is not a finalizer");
         }
 
+        if (@TypeOf(TLSSocket.getALPNProtocol) != GetterType)
+            @compileLog("Expected TLSSocket.getALPNProtocol to be a getter");
+
         if (@TypeOf(TLSSocket.getAuthorized) != GetterType)
             @compileLog("Expected TLSSocket.getAuthorized to be a getter");
 
@@ -4514,18 +4624,23 @@ pub const JSTLSSocket = struct {
         if (@TypeOf(TLSSocket.getRemoteAddress) != GetterType)
             @compileLog("Expected TLSSocket.getRemoteAddress to be a getter");
 
+        if (@TypeOf(TLSSocket.setServername) != CallbackType)
+            @compileLog("Expected TLSSocket.setServername to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.setServername)));
         if (@TypeOf(TLSSocket.shutdown) != CallbackType)
             @compileLog("Expected TLSSocket.shutdown to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.shutdown)));
         if (@TypeOf(TLSSocket.timeout) != CallbackType)
             @compileLog("Expected TLSSocket.timeout to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.timeout)));
         if (@TypeOf(TLSSocket.unref) != CallbackType)
             @compileLog("Expected TLSSocket.unref to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.unref)));
+        if (@TypeOf(TLSSocket.upgradeTLS) != CallbackType)
+            @compileLog("Expected TLSSocket.upgradeTLS to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.upgradeTLS)));
         if (@TypeOf(TLSSocket.write) != CallbackType)
             @compileLog("Expected TLSSocket.write to be a callback but received " ++ @typeName(@TypeOf(TLSSocket.write)));
         if (!JSC.is_bindgen) {
             @export(TLSSocket.end, .{ .name = "TLSSocketPrototype__end" });
             @export(TLSSocket.finalize, .{ .name = "TLSSocketClass__finalize" });
             @export(TLSSocket.flush, .{ .name = "TLSSocketPrototype__flush" });
+            @export(TLSSocket.getALPNProtocol, .{ .name = "TLSSocketPrototype__getALPNProtocol" });
             @export(TLSSocket.getAuthorizationError, .{ .name = "TLSSocketPrototype__getAuthorizationError" });
             @export(TLSSocket.getAuthorized, .{ .name = "TLSSocketPrototype__getAuthorized" });
             @export(TLSSocket.getData, .{ .name = "TLSSocketPrototype__getData" });
@@ -4537,9 +4652,11 @@ pub const JSTLSSocket = struct {
             @export(TLSSocket.ref, .{ .name = "TLSSocketPrototype__ref" });
             @export(TLSSocket.reload, .{ .name = "TLSSocketPrototype__reload" });
             @export(TLSSocket.setData, .{ .name = "TLSSocketPrototype__setData" });
+            @export(TLSSocket.setServername, .{ .name = "TLSSocketPrototype__setServername" });
             @export(TLSSocket.shutdown, .{ .name = "TLSSocketPrototype__shutdown" });
             @export(TLSSocket.timeout, .{ .name = "TLSSocketPrototype__timeout" });
             @export(TLSSocket.unref, .{ .name = "TLSSocketPrototype__unref" });
+            @export(TLSSocket.upgradeTLS, .{ .name = "TLSSocketPrototype__upgradeTLS" });
             @export(TLSSocket.write, .{ .name = "TLSSocketPrototype__write" });
         }
     }
@@ -4855,6 +4972,7 @@ comptime {
     _ = JSExpectAnything;
     _ = JSExpectStringContaining;
     _ = JSExpectStringMatching;
+    _ = JSFSWatcher;
     _ = JSFileSystemRouter;
     _ = JSListener;
     _ = JSMD4;
diff --git a/src/bun.js/bindings/generated_classes_list.zig b/src/bun.js/bindings/generated_classes_list.zig
index d5d987dce..543d492b5 100644
--- a/src/bun.js/bindings/generated_classes_list.zig
+++ b/src/bun.js/bindings/generated_classes_list.zig
@@ -4,11 +4,11 @@ pub const Classes = struct {
     pub const Blob = JSC.WebCore.Blob;
     pub const CryptoHasher = JSC.API.Bun.Crypto.CryptoHasher;
     pub const Dirent = JSC.Node.Dirent;
-    pub const Expect = JSC.Jest.Expect;
-    pub const ExpectAny = JSC.Jest.ExpectAny;
-    pub const ExpectAnything = JSC.Jest.ExpectAnything;
-    pub const ExpectStringContaining = JSC.Jest.ExpectStringContaining;
-    pub const ExpectStringMatching = JSC.Jest.ExpectStringMatching;
+    pub const Expect = JSC.Expect.Expect;
+    pub const ExpectAny = JSC.Expect.ExpectAny;
+    pub const ExpectAnything = JSC.Expect.ExpectAnything;
+    pub const ExpectStringContaining = JSC.Expect.ExpectStringContaining;
+    pub const ExpectStringMatching = JSC.Expect.ExpectStringMatching;
     pub const FileSystemRouter = JSC.API.FileSystemRouter;
     pub const Bundler = JSC.API.JSBundler;
     pub const JSBundler = Bundler;
@@ -37,4 +37,5 @@ pub const Classes = struct {
     pub const BuildArtifact = JSC.API.BuildArtifact;
     pub const BuildMessage = JSC.BuildMessage;
     pub const ResolveMessage = JSC.ResolveMessage;
+    pub const FSWatcher = JSC.Node.FSWatcher;
 };
diff --git a/src/bun.js/bindings/header-gen.zig b/src/bun.js/bindings/header-gen.zig
index 089506a8f..eb0de1c09 100644
--- a/src/bun.js/bindings/header-gen.zig
+++ b/src/bun.js/bindings/header-gen.zig
@@ -807,7 +807,7 @@ pub fn HeaderGen(comptime first_import: type, comptime second_import: type, comp
                                                 }
                                             };
                                             var extern_list = Type.Extern;
-                                            std.sort.sort([]const u8, &extern_list, Sorder{}, Sorder.lessThan);
+                                            std.sort.block([]const u8, &extern_list, Sorder{}, Sorder.lessThan);
                                             break :brk extern_list;
                                         };
                                         // impl_writer.print("  #include {s}\n", .{Type.include}) catch unreachable;
@@ -840,7 +840,7 @@ pub fn HeaderGen(comptime first_import: type, comptime second_import: type, comp
                                                 }
                                             };
                                             var extern_list = Type.Export;
-                                            std.sort.sort(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
+                                            std.sort.block(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
                                             break :brk extern_list;
                                         };
 
@@ -867,7 +867,7 @@ pub fn HeaderGen(comptime first_import: type, comptime second_import: type, comp
                                     //             }
                                     //         };
                                     //         var extern_list = Type.lazy_static_functions;
-                                    //         std.sort.sort(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
+                                    //         std.sort.block(StaticExport, &extern_list, Sorder{}, Sorder.lessThan);
                                     //         break :brk extern_list;
                                     //     };
 
diff --git a/src/bun.js/bindings/headers-handwritten.h b/src/bun.js/bindings/headers-handwritten.h
index 57940550f..90c8f86d2 100644
--- a/src/bun.js/bindings/headers-handwritten.h
+++ b/src/bun.js/bindings/headers-handwritten.h
@@ -72,6 +72,7 @@ typedef struct ResolvedSource {
     void* allocator;
     uint64_t tag;
 } ResolvedSource;
+static const uint64_t ResolvedSourceTagPackageJSONTypeModule = 1;
 typedef union ErrorableResolvedSourceResult {
     ResolvedSource value;
     ZigErrorType err;
@@ -83,10 +84,10 @@ typedef struct ErrorableResolvedSource {
 
 typedef struct SystemError {
     int errno_;
-    ZigString code;
-    ZigString message;
-    ZigString path;
-    ZigString syscall;
+    BunString code;
+    BunString message;
+    BunString path;
+    BunString syscall;
     int fd;
 } SystemError;
 
@@ -119,15 +120,15 @@ typedef struct ZigStackFramePosition {
 } ZigStackFramePosition;
 
 typedef struct ZigStackFrame {
-    ZigString function_name;
-    ZigString source_url;
+    BunString function_name;
+    BunString source_url;
     ZigStackFramePosition position;
     ZigStackFrameCode code_type;
     bool remapped;
 } ZigStackFrame;
 
 typedef struct ZigStackTrace {
-    ZigString* source_lines_ptr;
+    BunString* source_lines_ptr;
     int32_t* source_lines_numbers;
     uint8_t source_lines_len;
     uint8_t source_lines_to_collect;
@@ -139,11 +140,11 @@ typedef struct ZigException {
     unsigned char code;
     uint16_t runtime_type;
     int errno_;
-    ZigString syscall;
-    ZigString code_;
-    ZigString path;
-    ZigString name;
-    ZigString message;
+    BunString syscall;
+    BunString code_;
+    BunString path;
+    BunString name;
+    BunString message;
     ZigStackTrace stack;
     void* exception;
     bool remapped;
@@ -245,6 +246,10 @@ BunString toString(WTF::String& wtfString);
 BunString toString(const WTF::String& wtfString);
 BunString toString(WTF::StringImpl* wtfString);
 
+BunString toStringRef(JSC::JSGlobalObject* globalObject, JSC::JSValue value);
+BunString toStringRef(WTF::String& wtfString);
+BunString toStringRef(const WTF::String& wtfString);
+BunString toStringRef(WTF::StringImpl* wtfString);
 }
 
 using Uint8Array_alias = JSC::JSUint8Array;
diff --git a/src/bun.js/bindings/headers.h b/src/bun.js/bindings/headers.h
index cdf7e05f4..f507121f8 100644
--- a/src/bun.js/bindings/headers.h
+++ b/src/bun.js/bindings/headers.h
@@ -253,6 +253,7 @@ CPP_DECL JSC__JSPromise* JSC__JSPromise__resolvedPromise(JSC__JSGlobalObject* ar
 CPP_DECL JSC__JSValue JSC__JSPromise__resolvedPromiseValue(JSC__JSGlobalObject* arg0, JSC__JSValue JSValue1);
 CPP_DECL void JSC__JSPromise__resolveOnNextTick(JSC__JSPromise* arg0, JSC__JSGlobalObject* arg1, JSC__JSValue JSValue2);
 CPP_DECL JSC__JSValue JSC__JSPromise__result(JSC__JSPromise* arg0, JSC__VM* arg1);
+CPP_DECL void JSC__JSPromise__setHandled(JSC__JSPromise* arg0, JSC__VM* arg1);
 CPP_DECL uint32_t JSC__JSPromise__status(const JSC__JSPromise* arg0, JSC__VM* arg1);
 
 #pragma mark - JSC::JSInternalPromise
@@ -267,6 +268,7 @@ CPP_DECL void JSC__JSInternalPromise__rejectWithCaughtException(JSC__JSInternalP
 CPP_DECL void JSC__JSInternalPromise__resolve(JSC__JSInternalPromise* arg0, JSC__JSGlobalObject* arg1, JSC__JSValue JSValue2);
 CPP_DECL JSC__JSInternalPromise* JSC__JSInternalPromise__resolvedPromise(JSC__JSGlobalObject* arg0, JSC__JSValue JSValue1);
 CPP_DECL JSC__JSValue JSC__JSInternalPromise__result(const JSC__JSInternalPromise* arg0, JSC__VM* arg1);
+CPP_DECL void JSC__JSInternalPromise__setHandled(JSC__JSInternalPromise* arg0, JSC__VM* arg1);
 CPP_DECL uint32_t JSC__JSInternalPromise__status(const JSC__JSInternalPromise* arg0, JSC__VM* arg1);
 
 #pragma mark - JSC::JSFunction
diff --git a/src/bun.js/bindings/headers.zig b/src/bun.js/bindings/headers.zig
index 4dda5f30b..666369b21 100644
--- a/src/bun.js/bindings/headers.zig
+++ b/src/bun.js/bindings/headers.zig
@@ -168,6 +168,7 @@ pub extern fn JSC__JSPromise__resolvedPromise(arg0: *bindings.JSGlobalObject, JS
 pub extern fn JSC__JSPromise__resolvedPromiseValue(arg0: *bindings.JSGlobalObject, JSValue1: JSC__JSValue) JSC__JSValue;
 pub extern fn JSC__JSPromise__resolveOnNextTick(arg0: ?*bindings.JSPromise, arg1: *bindings.JSGlobalObject, JSValue2: JSC__JSValue) void;
 pub extern fn JSC__JSPromise__result(arg0: ?*bindings.JSPromise, arg1: *bindings.VM) JSC__JSValue;
+pub extern fn JSC__JSPromise__setHandled(arg0: ?*bindings.JSPromise, arg1: *bindings.VM) void;
 pub extern fn JSC__JSPromise__status(arg0: [*c]const JSC__JSPromise, arg1: *bindings.VM) u32;
 pub extern fn JSC__JSInternalPromise__create(arg0: *bindings.JSGlobalObject) [*c]bindings.JSInternalPromise;
 pub extern fn JSC__JSInternalPromise__isHandled(arg0: [*c]const JSC__JSInternalPromise, arg1: *bindings.VM) bool;
@@ -179,6 +180,7 @@ pub extern fn JSC__JSInternalPromise__rejectWithCaughtException(arg0: [*c]bindin
 pub extern fn JSC__JSInternalPromise__resolve(arg0: [*c]bindings.JSInternalPromise, arg1: *bindings.JSGlobalObject, JSValue2: JSC__JSValue) void;
 pub extern fn JSC__JSInternalPromise__resolvedPromise(arg0: *bindings.JSGlobalObject, JSValue1: JSC__JSValue) [*c]bindings.JSInternalPromise;
 pub extern fn JSC__JSInternalPromise__result(arg0: [*c]const JSC__JSInternalPromise, arg1: *bindings.VM) JSC__JSValue;
+pub extern fn JSC__JSInternalPromise__setHandled(arg0: [*c]bindings.JSInternalPromise, arg1: *bindings.VM) void;
 pub extern fn JSC__JSInternalPromise__status(arg0: [*c]const JSC__JSInternalPromise, arg1: *bindings.VM) u32;
 pub extern fn JSC__JSFunction__optimizeSoon(JSValue0: JSC__JSValue) void;
 pub extern fn JSC__JSGlobalObject__bunVM(arg0: *bindings.JSGlobalObject) ?*bindings.VirtualMachine;
diff --git a/src/bun.js/bindings/helpers.h b/src/bun.js/bindings/helpers.h
index 402807f3d..00777c304 100644
--- a/src/bun.js/bindings/helpers.h
+++ b/src/bun.js/bindings/helpers.h
@@ -342,10 +342,10 @@ static const WTF::String toStringStatic(ZigString str)
     }
 
     if (isTaggedUTF16Ptr(str.ptr)) {
-        return WTF::String(WTF::ExternalStringImpl::createStatic(reinterpret_cast<const UChar*>(untag(str.ptr)), str.len));
+        return WTF::String(AtomStringImpl::add(reinterpret_cast<const UChar*>(untag(str.ptr)), str.len));
     }
 
-    return WTF::String(WTF::ExternalStringImpl::createStatic(
+    return WTF::String(AtomStringImpl::add(
         reinterpret_cast<const LChar*>(untag(str.ptr)), str.len));
 }
 
diff --git a/src/bun.js/bindings/napi.cpp b/src/bun.js/bindings/napi.cpp
index a859e3ac5..8fffcc05f 100644
--- a/src/bun.js/bindings/napi.cpp
+++ b/src/bun.js/bindings/napi.cpp
@@ -14,7 +14,7 @@
 #include "wtf/text/StringView.h"
 #include "wtf/text/StringBuilder.h"
 #include "wtf/text/WTFString.h"
-
+#include "BufferEncodingType.h"
 #include "JavaScriptCore/AggregateError.h"
 #include "JavaScriptCore/BytecodeIndex.h"
 #include "JavaScriptCore/CallFrame.h"
@@ -554,7 +554,6 @@ extern "C" napi_status napi_wrap(napi_env env,
 
     auto* globalObject = toJS(env);
     auto& vm = globalObject->vm();
-    
 
     auto* val = jsDynamicCast<NapiPrototype*>(value);
 
@@ -572,7 +571,7 @@ extern "C" napi_status napi_wrap(napi_env env,
     auto clientData = WebCore::clientData(vm);
 
     auto* ref = new NapiRef(globalObject, 1);
-    ref->strongRef.set(globalObject->vm(), value.getObject());    
+    ref->strongRef.set(globalObject->vm(), value.getObject());
 
     if (finalize_cb) {
         ref->finalizer.finalize_cb = finalize_cb;
@@ -816,7 +815,7 @@ extern "C" napi_status napi_create_reference(napi_env env, napi_value value,
         }
     }
 
-    if(object) {
+    if (object) {
         object->napiRef = ref;
     }
 
@@ -1029,7 +1028,26 @@ extern "C" napi_status napi_create_type_error(napi_env env, napi_value code,
 
     auto error = JSC::createTypeError(globalObject, messageValue.toWTFString(globalObject));
     if (codeValue) {
-        error->putDirect(vm, Identifier::fromString(vm, "code"_s), codeValue, 0);
+        error->putDirect(vm, WebCore::builtinNames(vm).codePublicName(), codeValue, 0);
+    }
+
+    *result = reinterpret_cast<napi_value>(JSC::JSValue::encode(error));
+    return napi_ok;
+}
+
+extern "C" napi_status napi_create_error(napi_env env, napi_value code,
+    napi_value msg,
+    napi_value* result)
+{
+    Zig::GlobalObject* globalObject = toJS(env);
+    JSC::VM& vm = globalObject->vm();
+
+    JSC::JSValue codeValue = JSC::JSValue::decode(reinterpret_cast<JSC::EncodedJSValue>(code));
+    JSC::JSValue messageValue = JSC::JSValue::decode(reinterpret_cast<JSC::EncodedJSValue>(msg));
+
+    auto error = JSC::createError(globalObject, messageValue.toWTFString(globalObject));
+    if (codeValue) {
+        error->putDirect(vm, WebCore::builtinNames(vm).codePublicName(), codeValue, 0);
     }
 
     *result = reinterpret_cast<napi_value>(JSC::JSValue::encode(error));
@@ -1170,9 +1188,7 @@ void NapiClass::visitChildrenImpl(JSCell* cell, Visitor& visitor)
 
 DEFINE_VISIT_CHILDREN(NapiClass);
 
-static JSC_DECLARE_HOST_FUNCTION(NapiClass_ConstructorFunction);
-
-static JSC_DEFINE_HOST_FUNCTION(NapiClass_ConstructorFunction,
+JSC_DEFINE_HOST_FUNCTION(NapiClass_ConstructorFunction,
     (JSC::JSGlobalObject * globalObject, JSC::CallFrame* callFrame))
 {
     JSC::VM& vm = globalObject->vm();
@@ -1262,7 +1278,6 @@ NapiClass* NapiClass::create(VM& vm, Zig::GlobalObject* globalObject, const char
 {
     WTF::String name = WTF::String::fromUTF8(utf8name, length).isolatedCopy();
     NativeExecutable* executable = vm.getHostFunction(NapiClass_ConstructorFunction, ImplementationVisibility::Public, NapiClass_ConstructorFunction, name);
-
     Structure* structure = globalObject->NapiClassStructure();
     NapiClass* napiClass = new (NotNull, allocateCell<NapiClass>(vm)) NapiClass(vm, executable, globalObject, structure);
     napiClass->finishCreation(vm, executable, length, name, constructor, data, property_count, properties);
@@ -1474,7 +1489,85 @@ extern "C" napi_status napi_get_property_names(napi_env env, napi_value object,
     return napi_ok;
 }
 
-extern "C" napi_status napi_create_object(napi_env env, napi_value* result){
+// N-API: encodes the JS string `napiValue` as UTF-8 into `buf`; non-strings yield napi_string_expected.
+// `bufsize` counts the trailing NUL, so at most `bufsize - 1` bytes are written; `*writtenPtr` (optional) excludes the NUL.
+// When `buf` is null nothing is written and `*writtenPtr` receives the UTF-8 byte length of the string.
+extern "C" napi_status napi_get_value_string_utf8(napi_env env,
+    napi_value napiValue, char* buf,
+    size_t bufsize,
+    size_t* writtenPtr)
+{
+    JSGlobalObject* globalObject = toJS(env);
+
+    JSValue jsValue = toJS(napiValue);
+    if (!jsValue || !jsValue.isString()) {
+        return napi_string_expected;
+    }
+
+    JSString* jsString = jsValue.toStringOrNull(globalObject);
+    if (UNLIKELY(!jsString)) {
+        return napi_generic_failure;
+    }
+
+    size_t length = jsString->length();
+    auto viewWithUnderlyingString = jsString->viewWithUnderlyingString(globalObject);
+    auto view = viewWithUnderlyingString.view;
+
+    if (buf == nullptr) {
+        if (writtenPtr != nullptr) {
+            if (view.is8Bit()) {
+                *writtenPtr = Bun__encoding__byteLengthLatin1(view.characters8(), length, static_cast<uint8_t>(WebCore::BufferEncodingType::utf8));
+            } else {
+                *writtenPtr = Bun__encoding__byteLengthUTF16(view.characters16(), length, static_cast<uint8_t>(WebCore::BufferEncodingType::utf8));
+            }
+        }
+        return napi_ok;
+    }
+
+    // `buf` is an output buffer: its current contents (and therefore strlen) say nothing about its capacity.
+    // Without a usable size (0 or NAPI_AUTO_LENGTH) no string bytes are written; otherwise one byte is kept for the NUL.
+    const bool hasCapacity = bufsize != 0 && bufsize != NAPI_AUTO_LENGTH;
+    const size_t capacity = hasCapacity ? bufsize - 1 : 0;
+
+    size_t written = 0;
+    if (capacity != 0 && view.is8Bit()) {
+        written = Bun__encoding__writeLatin1(view.characters8(), view.length(), reinterpret_cast<unsigned char*>(buf), capacity, static_cast<uint8_t>(WebCore::BufferEncodingType::utf8));
+    } else if (capacity != 0) {
+        written = Bun__encoding__writeUTF16(view.characters16(), view.length(), reinterpret_cast<unsigned char*>(buf), capacity, static_cast<uint8_t>(WebCore::BufferEncodingType::utf8));
+    }
+
+    if (writtenPtr != nullptr) {
+        *writtenPtr = written;
+    }
+    if (written < bufsize) { // false only when bufsize == 0, assuming the encoder honours `capacity`
+        buf[written] = '\0';
+    }
+    return napi_ok;
+}
+
+// N-API: reads `objectValue[index]` and stores it in `*result` when `result` is non-null.
+// A non-object input yields napi_invalid_arg; an exception raised by the read yields napi_generic_failure.
+extern "C" napi_status napi_get_element(napi_env env, napi_value objectValue,
+    uint32_t index, napi_value* result)
+{
+    JSValue target = toJS(objectValue);
+    const bool isUsableObject = target && target.isObject();
+    if (!isUsableObject) {
+        return napi_invalid_arg;
+    }
+
+    JSObject* holder = target.getObject();
+    auto scope = DECLARE_THROW_SCOPE(holder->vm());
+    JSValue fetched = holder->getIndex(toJS(env), index);
+    RETURN_IF_EXCEPTION(scope, napi_generic_failure);
+
+    if (result != nullptr) {
+        *result = toNapi(fetched);
+    }
+    return napi_ok;
+}
+
+extern "C" napi_status napi_create_object(napi_env env, napi_value* result)
+{
 
     if (UNLIKELY(result == nullptr)) {
         return napi_invalid_arg;
@@ -1520,8 +1613,10 @@ extern "C" napi_status napi_typeof(napi_env env, napi_value val,
 
     JSC::JSValue value = toJS(val);
 
-    if (UNLIKELY(value.isEmpty())) {
-        return napi_invalid_arg;
+    if (value.isEmpty()) {
+        // An empty JSValue can reach this point; report it as undefined instead of failing.
+        *result = napi_undefined;
+        return napi_ok;
     }
 
     if (value.isCell()) {
@@ -1560,17 +1655,18 @@ extern "C" napi_status napi_typeof(napi_env env, napi_value val,
             *result = napi_object;
             return napi_ok;
 
-        default:
-            if (cell->isObject()) {
-                *result = napi_object;
+        default: {
+            if (cell->isCallable() || cell->isConstructor()) {
+                *result = napi_function;
                 return napi_ok;
             }
 
-            if (cell->isCallable() || cell->isConstructor()) {
-                *result = napi_function;
+            if (cell->isObject()) {
+                *result = napi_object;
                 return napi_ok;
             }
         }
+        }
     }
 
     if (value.isNumber()) {
diff --git a/src/bun.js/bindings/node_util_types.cpp b/src/bun.js/bindings/node_util_types.cpp
index 0c75662cf..f7ae3949e 100644
--- a/src/bun.js/bindings/node_util_types.cpp
+++ b/src/bun.js/bindings/node_util_types.cpp
@@ -322,7 +322,7 @@ void generateNodeUtilTypesSourceCode(JSC::JSGlobalObject* lexicalGlobalObject,
 
     JSC::VM& vm = globalObject->vm();
 
-    JSC::JSObject* defaultObject = constructEmptyObject(globalObject, globalObject->objectPrototype(), 43);
+    JSC::JSObject* defaultObject = constructEmptyObject(globalObject, globalObject->objectPrototype(), 42);
     exportNames.reserveCapacity(43);
     exportValues.ensureCapacity(43);
 
diff --git a/src/bun.js/bindings/shimmer.zig b/src/bun.js/bindings/shimmer.zig
index a90bfab87..3a6242000 100644
--- a/src/bun.js/bindings/shimmer.zig
+++ b/src/bun.js/bindings/shimmer.zig
@@ -163,7 +163,7 @@ pub fn Shimmer(comptime _namespace: []const u8, comptime _name: []const u8, comp
             if (comptime isNullableType(ExpectedReturnType) != isNullableType(ExternReturnType)) {
                 return value.?;
             } else if (comptime (@typeInfo(ExpectedReturnType) == .Enum) and (@typeInfo(ExternReturnType) != .Enum)) {
-                return @intToEnum(ExpectedReturnType, value);
+                return @enumFromInt(ExpectedReturnType, value);
             } else {
                 return value;
             }
diff --git a/src/bun.js/bindings/simdutf.cpp b/src/bun.js/bindings/simdutf.cpp
index ea0d95f42..6d20bcf5e 100644
--- a/src/bun.js/bindings/simdutf.cpp
+++ b/src/bun.js/bindings/simdutf.cpp
@@ -1,8 +1,8 @@
-/* auto-generated on 2023-02-10 14:42:58 -0500. Do not edit! */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf.cpp
+/* auto-generated on 2023-06-21 08:09:45 -0400. Do not edit! */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf.cpp
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=implementation.cpp
 /* begin file src/implementation.cpp */
 #include <initializer_list>
 #include <climits>
@@ -11,22 +11,23 @@
 namespace simdutf {
 namespace {
 
-template <typename T>
-std::string toBinaryString(T b) {
-   std::string binary = "";
-   T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
-   while (mask > 0) {
-    binary += ((b & mask) == 0) ? '0' : '1';
-    mask >>= 1;
-  }
-  return binary;
+template<typename T>
+std::string toBinaryString(T b)
+{
+    std::string binary = "";
+    T mask = T(1) << (sizeof(T) * CHAR_BIT - 1);
+    while (mask > 0) {
+        binary += ((b & mask) == 0) ? '0' : '1';
+        mask >>= 1;
+    }
+    return binary;
 }
 }
 }
 
 // Implementations
 // The best choice should always come first!
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64.h
 /* begin file src/simdutf/arm64.h */
 #ifndef SIMDUTF_ARM64_H
 #define SIMDUTF_ARM64_H
@@ -35,13 +36,10 @@ std::string toBinaryString(T b) {
 #error "arm64.h must be included before fallback.h"
 #endif
 
-
 #ifndef SIMDUTF_IMPLEMENTATION_ARM64
 #define SIMDUTF_IMPLEMENTATION_ARM64 (SIMDUTF_IS_ARM64)
 #endif
-#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64 && SIMDUTF_IS_ARM64
-
-
+#define SIMDUTF_CAN_ALWAYS_RUN_ARM64 SIMDUTF_IMPLEMENTATION_ARM64&& SIMDUTF_IS_ARM64
 
 #if SIMDUTF_IMPLEMENTATION_ARM64
 
@@ -53,12 +51,11 @@ namespace arm64 {
 } // namespace arm64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/implementation.h
 /* begin file src/simdutf/arm64/implementation.h */
 #ifndef SIMDUTF_ARM64_IMPLEMENTATION_H
 #define SIMDUTF_ARM64_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace arm64 {
 
@@ -68,60 +65,85 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation("arm64", "ARM NEON", internal::instruction_set::NEON)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace arm64
@@ -130,26 +152,25 @@ public:
 #endif // SIMDUTF_ARM64_IMPLEMENTATION_H
 /* end file src/simdutf/arm64/implementation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/begin.h
 /* begin file src/simdutf/arm64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
 // #define SIMDUTF_IMPLEMENTATION arm64
 /* end file src/simdutf/arm64/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/intrinsics.h
 /* begin file src/simdutf/arm64/intrinsics.h */
 #ifndef SIMDUTF_ARM64_INTRINSICS_H
 #define SIMDUTF_ARM64_INTRINSICS_H
 
-
 // This should be the correct header whether
 // you use visual studio or other compilers.
 #include <arm_neon.h>
 
 #endif //  SIMDUTF_ARM64_INTRINSICS_H
 /* end file src/simdutf/arm64/intrinsics.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/bitmanipulation.h
 /* begin file src/simdutf/arm64/bitmanipulation.h */
 #ifndef SIMDUTF_ARM64_BITMANIPULATION_H
 #define SIMDUTF_ARM64_BITMANIPULATION_H
@@ -159,8 +180,9 @@ namespace arm64 {
 namespace {
 
 /* result might be undefined when input_num is zero */
-simdutf_really_inline int count_ones(uint64_t input_num) {
-   return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
+simdutf_really_inline int count_ones(uint64_t input_num)
+{
+    return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
 } // unnamed namespace
@@ -169,14 +191,13 @@ simdutf_really_inline int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_ARM64_BITMANIPULATION_H
 /* end file src/simdutf/arm64/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/simd.h
 /* begin file src/simdutf/arm64/simd.h */
 #ifndef SIMDUTF_ARM64_SIMD_H
 #define SIMDUTF_ARM64_SIMD_H
 
 #include <type_traits>
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
@@ -186,7 +207,6 @@ namespace simd {
 namespace {
 // Start of private section with Visual Studio workaround
 
-
 /**
  * make_uint8x16_t initializes a SIMD register (uint8x16_t).
  * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
@@ -198,130 +218,138 @@ namespace {
  * You should not use this function except for compile-time constants:
  * it is not efficient.
  */
-simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
-                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
-                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
-                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
-  // Doing a load like so end ups generating worse code.
-  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
-  //                     x9, x10,x11,x12,x13,x14,x15,x16};
-  // return vld1q_u8(array);
-  uint8x16_t x{};
-  // incredibly, Visual Studio does not allow x[0] = x1
-  x = vsetq_lane_u8(x1, x, 0);
-  x = vsetq_lane_u8(x2, x, 1);
-  x = vsetq_lane_u8(x3, x, 2);
-  x = vsetq_lane_u8(x4, x, 3);
-  x = vsetq_lane_u8(x5, x, 4);
-  x = vsetq_lane_u8(x6, x, 5);
-  x = vsetq_lane_u8(x7, x, 6);
-  x = vsetq_lane_u8(x8, x, 7);
-  x = vsetq_lane_u8(x9, x, 8);
-  x = vsetq_lane_u8(x10, x, 9);
-  x = vsetq_lane_u8(x11, x, 10);
-  x = vsetq_lane_u8(x12, x, 11);
-  x = vsetq_lane_u8(x13, x, 12);
-  x = vsetq_lane_u8(x14, x, 13);
-  x = vsetq_lane_u8(x15, x, 14);
-  x = vsetq_lane_u8(x16, x, 15);
-  return x;
+simdutf_really_inline uint8x16_t make_uint8x16_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
+    uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8,
+    uint8_t x9, uint8_t x10, uint8_t x11, uint8_t x12,
+    uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16)
+{
+    // Doing a load like so ends up generating worse code.
+    // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+    //                     x9, x10,x11,x12,x13,x14,x15,x16};
+    // return vld1q_u8(array);
+    uint8x16_t x {};
+    // incredibly, Visual Studio does not allow x[0] = x1
+    x = vsetq_lane_u8(x1, x, 0);
+    x = vsetq_lane_u8(x2, x, 1);
+    x = vsetq_lane_u8(x3, x, 2);
+    x = vsetq_lane_u8(x4, x, 3);
+    x = vsetq_lane_u8(x5, x, 4);
+    x = vsetq_lane_u8(x6, x, 5);
+    x = vsetq_lane_u8(x7, x, 6);
+    x = vsetq_lane_u8(x8, x, 7);
+    x = vsetq_lane_u8(x9, x, 8);
+    x = vsetq_lane_u8(x10, x, 9);
+    x = vsetq_lane_u8(x11, x, 10);
+    x = vsetq_lane_u8(x12, x, 11);
+    x = vsetq_lane_u8(x13, x, 12);
+    x = vsetq_lane_u8(x14, x, 13);
+    x = vsetq_lane_u8(x15, x, 14);
+    x = vsetq_lane_u8(x16, x, 15);
+    return x;
 }
 
 // We have to do the same work for make_int8x16_t
-simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
-                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
-                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
-                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
-  // Doing a load like so end ups generating worse code.
-  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
-  //                     x9, x10,x11,x12,x13,x14,x15,x16};
-  // return vld1q_s8(array);
-  int8x16_t x{};
-  // incredibly, Visual Studio does not allow x[0] = x1
-  x = vsetq_lane_s8(x1, x, 0);
-  x = vsetq_lane_s8(x2, x, 1);
-  x = vsetq_lane_s8(x3, x, 2);
-  x = vsetq_lane_s8(x4, x, 3);
-  x = vsetq_lane_s8(x5, x, 4);
-  x = vsetq_lane_s8(x6, x, 5);
-  x = vsetq_lane_s8(x7, x, 6);
-  x = vsetq_lane_s8(x8, x, 7);
-  x = vsetq_lane_s8(x9, x, 8);
-  x = vsetq_lane_s8(x10, x, 9);
-  x = vsetq_lane_s8(x11, x, 10);
-  x = vsetq_lane_s8(x12, x, 11);
-  x = vsetq_lane_s8(x13, x, 12);
-  x = vsetq_lane_s8(x14, x, 13);
-  x = vsetq_lane_s8(x15, x, 14);
-  x = vsetq_lane_s8(x16, x, 15);
-  return x;
-}
-
-simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
-                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8) {
-  uint8x8_t x{};
-  x = vset_lane_u8(x1, x, 0);
-  x = vset_lane_u8(x2, x, 1);
-  x = vset_lane_u8(x3, x, 2);
-  x = vset_lane_u8(x4, x, 3);
-  x = vset_lane_u8(x5, x, 4);
-  x = vset_lane_u8(x6, x, 5);
-  x = vset_lane_u8(x7, x, 6);
-  x = vset_lane_u8(x8, x, 7);
-  return x;
-}
-
-simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1,  uint16_t x2,  uint16_t x3,  uint16_t x4,
-                                       uint16_t x5,  uint16_t x6,  uint16_t x7,  uint16_t x8) {
-  uint16x8_t x{};
-  x = vsetq_lane_u16(x1, x, 0);
-  x = vsetq_lane_u16(x2, x, 1);
-  x = vsetq_lane_u16(x3, x, 2);
-  x = vsetq_lane_u16(x4, x, 3);
-  x = vsetq_lane_u16(x5, x, 4);
-  x = vsetq_lane_u16(x6, x, 5);
-  x = vsetq_lane_u16(x7, x, 6);
-  x = vsetq_lane_u16(x8, x, 7);;
-  return x;
-}
-
-simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t x3,  int16_t x4,
-                                       int16_t x5,  int16_t x6,  int16_t x7,  int16_t x8) {
-  uint16x8_t x{};
-  x = vsetq_lane_s16(x1, x, 0);
-  x = vsetq_lane_s16(x2, x, 1);
-  x = vsetq_lane_s16(x3, x, 2);
-  x = vsetq_lane_s16(x4, x, 3);
-  x = vsetq_lane_s16(x5, x, 4);
-  x = vsetq_lane_s16(x6, x, 5);
-  x = vsetq_lane_s16(x7, x, 6);
-  x = vsetq_lane_s16(x8, x, 7);;
-  return x;
+simdutf_really_inline int8x16_t make_int8x16_t(int8_t x1, int8_t x2, int8_t x3, int8_t x4,
+    int8_t x5, int8_t x6, int8_t x7, int8_t x8,
+    int8_t x9, int8_t x10, int8_t x11, int8_t x12,
+    int8_t x13, int8_t x14, int8_t x15, int8_t x16)
+{
+    // Doing a load like so ends up generating worse code.
+    // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+    //                     x9, x10,x11,x12,x13,x14,x15,x16};
+    // return vld1q_s8(array);
+    int8x16_t x {};
+    // incredibly, Visual Studio does not allow x[0] = x1
+    x = vsetq_lane_s8(x1, x, 0);
+    x = vsetq_lane_s8(x2, x, 1);
+    x = vsetq_lane_s8(x3, x, 2);
+    x = vsetq_lane_s8(x4, x, 3);
+    x = vsetq_lane_s8(x5, x, 4);
+    x = vsetq_lane_s8(x6, x, 5);
+    x = vsetq_lane_s8(x7, x, 6);
+    x = vsetq_lane_s8(x8, x, 7);
+    x = vsetq_lane_s8(x9, x, 8);
+    x = vsetq_lane_s8(x10, x, 9);
+    x = vsetq_lane_s8(x11, x, 10);
+    x = vsetq_lane_s8(x12, x, 11);
+    x = vsetq_lane_s8(x13, x, 12);
+    x = vsetq_lane_s8(x14, x, 13);
+    x = vsetq_lane_s8(x15, x, 14);
+    x = vsetq_lane_s8(x16, x, 15);
+    return x;
+}
+
+simdutf_really_inline uint8x8_t make_uint8x8_t(uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4,
+    uint8_t x5, uint8_t x6, uint8_t x7, uint8_t x8)
+{
+    uint8x8_t x {};
+    x = vset_lane_u8(x1, x, 0);
+    x = vset_lane_u8(x2, x, 1);
+    x = vset_lane_u8(x3, x, 2);
+    x = vset_lane_u8(x4, x, 3);
+    x = vset_lane_u8(x5, x, 4);
+    x = vset_lane_u8(x6, x, 5);
+    x = vset_lane_u8(x7, x, 6);
+    x = vset_lane_u8(x8, x, 7);
+    return x;
+}
+
+simdutf_really_inline uint16x8_t make_uint16x8_t(uint16_t x1, uint16_t x2, uint16_t x3, uint16_t x4,
+    uint16_t x5, uint16_t x6, uint16_t x7, uint16_t x8)
+{
+    uint16x8_t x {};
+    x = vsetq_lane_u16(x1, x, 0);
+    x = vsetq_lane_u16(x2, x, 1);
+    x = vsetq_lane_u16(x3, x, 2);
+    x = vsetq_lane_u16(x4, x, 3);
+    x = vsetq_lane_u16(x5, x, 4);
+    x = vsetq_lane_u16(x6, x, 5);
+    x = vsetq_lane_u16(x7, x, 6);
+    x = vsetq_lane_u16(x8, x, 7);
+    ;
+    return x;
+}
+
+simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1, int16_t x2, int16_t x3, int16_t x4,
+    int16_t x5, int16_t x6, int16_t x7, int16_t x8)
+{
+    uint16x8_t x {};
+    x = vsetq_lane_s16(x1, x, 0);
+    x = vsetq_lane_s16(x2, x, 1);
+    x = vsetq_lane_s16(x3, x, 2);
+    x = vsetq_lane_s16(x4, x, 3);
+    x = vsetq_lane_s16(x5, x, 4);
+    x = vsetq_lane_s16(x6, x, 5);
+    x = vsetq_lane_s16(x7, x, 6);
+    x = vsetq_lane_s16(x8, x, 7);
+    ;
+    return x;
 }
 
-
 // End of private section with Visual Studio workaround
 } // namespace
 #endif // SIMDUTF_REGULAR_VISUAL_STUDIO
 
+template<typename T>
+struct simd8;
 
-  template<typename T>
-  struct simd8;
-
-  //
-  // Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
-  //
-  template<typename T, typename Mask=simd8<bool>>
-  struct base_u8 {
+//
+// Base class of simd8<uint8_t> and simd8<bool>, both of which use uint8x16_t internally.
+//
+template<typename T, typename Mask = simd8<bool>>
+struct base_u8 {
     uint8x16_t value;
     static const int SIZE = sizeof(value);
 
     // Conversion from/to SIMD register
-    simdutf_really_inline base_u8(const uint8x16_t _value) : value(_value) {}
+    simdutf_really_inline base_u8(const uint8x16_t _value)
+        : value(_value)
+    {
+    }
     simdutf_really_inline operator const uint8x16_t&() const { return this->value; }
     simdutf_really_inline operator uint8x16_t&() { return this->value; }
-    simdutf_really_inline T first() const { return vgetq_lane_u8(*this,0); }
-    simdutf_really_inline T last() const { return vgetq_lane_u8(*this,15); }
+    simdutf_really_inline T first() const { return vgetq_lane_u8(*this, 0); }
+    simdutf_really_inline T last() const { return vgetq_lane_u8(*this, 15); }
 
     // Bit operations
     simdutf_really_inline simd8<T> operator|(const simd8<T> other) const { return vorrq_u8(*this, other); }
@@ -329,48 +357,74 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     simdutf_really_inline simd8<T> operator^(const simd8<T> other) const { return veorq_u8(*this, other); }
     simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const { return vbicq_u8(*this, other); }
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
-    simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) { auto this_cast = static_cast<simd8<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
+    simdutf_really_inline simd8<T>& operator|=(const simd8<T> other) // in-place bitwise OR
+    {
+        auto this_cast = static_cast<simd8<T>*>(this); // view *this as the simd8<T> built on this base so a simd8<T>& can be returned
+        *this_cast = *this_cast | other; // reuse operator| and store the result in place
+        return *this_cast;
+    }
+    simdutf_really_inline simd8<T>& operator&=(const simd8<T> other) // in-place bitwise AND
+    {
+        auto this_cast = static_cast<simd8<T>*>(this); // view *this as the simd8<T> built on this base so a simd8<T>& can be returned
+        *this_cast = *this_cast & other; // reuse operator& and store the result in place
+        return *this_cast;
+    }
+    simdutf_really_inline simd8<T>& operator^=(const simd8<T> other) // in-place bitwise XOR
+    {
+        auto this_cast = static_cast<simd8<T>*>(this); // view *this as the simd8<T> built on this base so a simd8<T>& can be returned
+        *this_cast = *this_cast ^ other; // reuse operator^ and store the result in place
+        return *this_cast;
+    }
 
-    simdutf_really_inline Mask operator==(const simd8<T> other) const { return vceqq_u8(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return vceqq_u8(lhs, rhs); }
 
-    template<int N=1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
-      return vextq_u8(prev_chunk, *this, 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
+    {
+        return vextq_u8(prev_chunk, *this, 16 - N);
     }
-  };
+};
 
-  // SIMD byte mask type (returned by things like eq and gt)
-  template<>
-  struct simd8<bool>: base_u8<bool> {
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd8<bool> : base_u8<bool> {
     typedef uint16_t bitmask_t;
     typedef uint32_t bitmask2_t;
 
     static simdutf_really_inline simd8<bool> splat(bool _value) { return vmovq_n_u8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<bool>(_value) {}
+    simdutf_really_inline simd8(const uint8x16_t _value)
+        : base_u8<bool>(_value)
+    {
+    }
     // False constructor
-    simdutf_really_inline simd8() : simd8(vdupq_n_u8(0)) {}
+    simdutf_really_inline simd8()
+        : simd8(vdupq_n_u8(0))
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(bool _value)
+        : simd8(splat(_value))
+    {
+    }
     simdutf_really_inline void store(uint8_t dst[16]) const { return vst1q_u8(dst, *this); }
 
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
-    simdutf_really_inline uint32_t to_bitmask() const {
+    simdutf_really_inline uint32_t to_bitmask() const
+    {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t bit_mask =  make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                                                   0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+        const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-      const uint8x16_t bit_mask =  {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+        const uint8x16_t bit_mask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 };
 #endif
-      auto minput = *this & bit_mask;
-      uint8x16_t tmp = vpaddq_u8(minput, minput);
-      tmp = vpaddq_u8(tmp, tmp);
-      tmp = vpaddq_u8(tmp, tmp);
-      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+        auto minput = *this & bit_mask;
+        uint8x16_t tmp = vpaddq_u8(minput, minput);
+        tmp = vpaddq_u8(tmp, tmp);
+        tmp = vpaddq_u8(tmp, tmp);
+        return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
     }
 
     // Returns 4-bit out of each byte, alternating between the high 4 bits and low bits
@@ -378,58 +432,70 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // This method is expected to be faster than none() and is equivalent
     // when the vector register is the result of a comparison, with byte
     // values 0xff and 0x00.
-    simdutf_really_inline uint64_t to_bitmask64() const {
-      return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
+    simdutf_really_inline uint64_t to_bitmask64() const
+    {
+        return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(*this), 4)), 0);
     }
 
     simdutf_really_inline bool any() const { return vmaxvq_u8(*this) != 0; }
     simdutf_really_inline bool none() const { return vmaxvq_u8(*this) == 0; }
     simdutf_really_inline bool all() const { return vminvq_u8(*this) == 0xFF; }
+};
 
-
-  };
-
-  // Unsigned bytes
-  template<>
-  struct simd8<uint8_t>: base_u8<uint8_t> {
+// Unsigned bytes
+template<>
+struct simd8<uint8_t> : base_u8<uint8_t> {
     static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) { return vmovq_n_u8(_value); }
     static simdutf_really_inline simd8<uint8_t> zero() { return vdupq_n_u8(0); }
     static simdutf_really_inline simd8<uint8_t> load(const uint8_t* values) { return vld1q_u8(values); }
-    simdutf_really_inline simd8(const uint8x16_t _value) : base_u8<uint8_t>(_value) {}
+    simdutf_really_inline simd8(const uint8x16_t _value)
+        : base_u8<uint8_t>(_value)
+    {
+    }
     // Zero constructor
-    simdutf_really_inline simd8() : simd8(zero()) {}
+    simdutf_really_inline simd8()
+        : simd8(zero())
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint8_t values[16])
+        : simd8(load(values))
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Member-by-member initialization
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) : simd8(make_uint8x16_t(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+        : simd8(make_uint8x16_t(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
 #else
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) : simd8(uint8x16_t{
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    }) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+        : simd8(uint8x16_t {
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15 })
+    {
+    }
 #endif
 
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) {
-      return simd8<uint8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+    {
+        return simd8<uint8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
     // Store to array
@@ -442,8 +508,16 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<uint8_t> operator+(const simd8<uint8_t> other) const { return vaddq_u8(*this, other); }
     simdutf_really_inline simd8<uint8_t> operator-(const simd8<uint8_t> other) const { return vsubq_u8(*this, other); }
-    simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other) { *this = *this + other; return *this; }
-    simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other) { *this = *this - other; return *this; }
+    simdutf_really_inline simd8<uint8_t>& operator+=(const simd8<uint8_t> other)
+    {
+        *this = *this + other;
+        return *this;
+    }
+    simdutf_really_inline simd8<uint8_t>& operator-=(const simd8<uint8_t> other)
+    {
+        *this = *this - other;
+        return *this;
+    }
 
     // Order-specific operations
     simdutf_really_inline uint8_t max_val() const { return vmaxvq_u8(*this); }
@@ -472,100 +546,116 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return lookup_table.apply_lookup_16_to(*this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return lookup_table.apply_lookup_16_to(*this);
     }
 
-
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
     }
 
     template<typename T>
-    simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const {
-      return vqtbl1q_u8(*this, simd8<uint8_t>(original));
+    simdutf_really_inline simd8<uint8_t> apply_lookup_16_to(const simd8<T> original) const
+    {
+        return vqtbl1q_u8(*this, simd8<uint8_t>(original));
     }
-  };
+};
 
-  // Signed bytes
-  template<>
-  struct simd8<int8_t> {
+// Signed bytes
+template<>
+struct simd8<int8_t> {
     int8x16_t value;
 
     static simdutf_really_inline simd8<int8_t> splat(int8_t _value) { return vmovq_n_s8(_value); }
     static simdutf_really_inline simd8<int8_t> zero() { return vdupq_n_s8(0); }
     static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) { return vld1q_s8(values); }
-    template <endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
-      uint16x8_t first = vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)));
-      uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value));
-      if (!match_system(big_endian)) {
-        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        #else
-        const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-        #endif
-        first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
-        second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
-      }
-      vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
-      vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
-      vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value))))));
-      vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8 (vreinterpretq_u8_s8(this->value)))));
-      vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))));
-      vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))));
+    template<endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* p) const // writes 16 char16_t starting at p, one per byte of this vector
+    {
+        uint16x8_t first = vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value))); // bytes 0-7 zero-extended to 16-bit lanes
+        uint16x8_t second = vmovl_high_u8(vreinterpretq_u8_s8(this->value)); // bytes 8-15 zero-extended to 16-bit lanes
+        if (!match_system(big_endian)) { // presumably: requested byte order differs from the host's (match_system is defined elsewhere - confirm)
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }; // table-lookup indices that exchange the two bytes of each 16-bit lane
+#endif
+            first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(first), swap));
+            second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(second), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(p), first);
+        vst1q_u16(reinterpret_cast<uint16_t*>(p + 8), second); // second half lands 8 code units after the first
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* p) const // writes 16 char32_t starting at p, one per byte of this vector
+    {
+        vst1q_u32(reinterpret_cast<uint32_t*>(p), vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value)))))); // bytes 0-3, zero-extended 8 -> 16 -> 32 bits
+        vst1q_u32(reinterpret_cast<uint32_t*>(p + 4), vmovl_high_u16(vmovl_u8(vget_low_u8(vreinterpretq_u8_s8(this->value))))); // bytes 4-7
+        vst1q_u32(reinterpret_cast<uint32_t*>(p + 8), vmovl_u16(vget_low_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value))))); // bytes 8-11
+        vst1q_u32(reinterpret_cast<uint32_t*>(p + 12), vmovl_high_u16(vmovl_high_u8(vreinterpretq_u8_s8(this->value)))); // bytes 12-15
     }
     // Conversion from/to SIMD register
-    simdutf_really_inline simd8(const int8x16_t _value) : value{_value} {}
+    simdutf_really_inline simd8(const int8x16_t _value)
+        : value { _value }
+    {
+    }
     simdutf_really_inline operator const int8x16_t&() const { return this->value; }
     simdutf_really_inline operator const uint8x16_t() const { return vreinterpretq_u8_s8(this->value); }
     simdutf_really_inline operator int8x16_t&() { return this->value; }
 
     // Zero constructor
-    simdutf_really_inline simd8() : simd8(zero()) {}
+    simdutf_really_inline simd8()
+        : simd8(zero())
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const int8_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) : simd8(make_int8x16_t(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8(make_int8x16_t(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
 #else
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3, int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) : simd8(int8x16_t{
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    }) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8(int8x16_t {
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15 })
+    {
+    }
 #endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) {
-      return simd8<int8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
     // Store to array
@@ -576,9 +666,15 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // In theory, we could check this occurrence with std::same_as and std::enabled_if but it is C++14
     // and relatively ugly and hard to read.
 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
-    simdutf_really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
+    simdutf_really_inline explicit simd8(const uint8x16_t other)
+        : simd8(vreinterpretq_s8_u8(other))
+    {
+    }
 #endif
-    simdutf_really_inline operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
+    simdutf_really_inline operator simd8<uint8_t>() const
+    {
+        return vreinterpretq_u8_s8(this->value);
+    }
 
     simdutf_really_inline simd8<int8_t> operator|(const simd8<int8_t> other) const { return vorrq_s8(value, other.value); }
     simdutf_really_inline simd8<int8_t> operator&(const simd8<int8_t> other) const { return vandq_s8(value, other.value); }
@@ -588,8 +684,16 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     // Math
     simdutf_really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(value, other.value); }
     simdutf_really_inline simd8<int8_t> operator-(const simd8<int8_t> other) const { return vsubq_s8(value, other.value); }
-    simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other) { *this = *this + other; return *this; }
-    simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other) { *this = *this - other; return *this; }
+    simdutf_really_inline simd8<int8_t>& operator+=(const simd8<int8_t> other)
+    {
+        *this = *this + other;
+        return *this;
+    }
+    simdutf_really_inline simd8<int8_t>& operator-=(const simd8<int8_t> other)
+    {
+        *this = *this - other;
+        return *this;
+    }
 
     simdutf_really_inline int8_t max_val() const { return vmaxvq_s8(value); }
     simdutf_really_inline int8_t min_val() const { return vminvq_s8(value); }
@@ -602,38 +706,41 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return vcltq_s8(value, other.value); }
     simdutf_really_inline simd8<bool> operator==(const simd8<int8_t> other) const { return vceqq_s8(value, other.value); }
 
-    template<int N=1>
-    simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const {
-      return vextq_s8(prev_chunk, *this, 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<int8_t> prev(const simd8<int8_t> prev_chunk) const
+    {
+        return vextq_s8(prev_chunk, *this, 16 - N);
     }
 
     // Perform a lookup assuming no value is larger than 16
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return lookup_table.apply_lookup_16_to(*this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return lookup_table.apply_lookup_16_to(*this);
     }
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
     }
 
     template<typename T>
-    simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original) {
-      return vqtbl1q_s8(*this, simd8<uint8_t>(original));
+    simdutf_really_inline simd8<int8_t> apply_lookup_16_to(const simd8<T> original)
+    {
+        return vqtbl1q_s8(*this, simd8<uint8_t>(original));
     }
-  };
+};
 
-  template<typename T>
-  struct simd8x64 {
+template<typename T>
+struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -642,159 +749,181 @@ simdutf_really_inline int16x8_t make_int16x8_t(int16_t x1,  int16_t x2,  int16_t
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
-
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
+    {
     }
 
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    }
 
-    simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      this->chunks[2] |= other.chunks[2];
-      this->chunks[3] |= other.chunks[3];
-      return *this;
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        this->chunks[2] |= other.chunks[2];
+        this->chunks[3] |= other.chunks[3];
+        return *this;
     }
 
-    simdutf_really_inline simd8<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return reduce_or().is_ascii();
     }
 
-    template <endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
+    template<endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const // forwards each of the four chunks to simd8<T>::store_ascii_as_utf16
+    {
+        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0); // output offset is in char16_t units: one code unit per input byte, sizeof(simd8<T>) bytes per chunk
+        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 3);
     }
 
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const // forwards each of the four chunks to simd8<T>::store_ascii_as_utf32
+    {
+        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0); // output offset is in char32_t units: one code unit per input byte, sizeof(simd8<T>) bytes per chunk
+        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t bit_mask = make_uint8x16_t(
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      );
+        const uint8x16_t bit_mask = make_uint8x16_t(
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-      const uint8x16_t bit_mask = {
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      };
+        const uint8x16_t bit_mask = {
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+        };
 #endif
-      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
-      uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
-      uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
-      sum0 = vpaddq_u8(sum0, sum1);
-      sum0 = vpaddq_u8(sum0, sum0);
-      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return  simd8x64<bool>(
-      this->chunks[0] == mask,
-      this->chunks[1] == mask,
-      this->chunks[2] == mask,
-      this->chunks[3] == mask
-    ).to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t lteq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return  simd8x64<bool>(
-      this->chunks[0] <= mask,
-      this->chunks[1] <= mask,
-      this->chunks[2] <= mask,
-      this->chunks[3] <= mask
-    ).to_bitmask();
-  }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-      return  simd8x64<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask,
-        this->chunks[2] > mask,
-        this->chunks[3] > mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask,
-        this->chunks[2] >= mask,
-        this->chunks[3] >= mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
-        simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
-        simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
-        simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask
-      ).to_bitmask();
-    }
-  }; // struct simd8x64<T>
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/simd16-inl.h
+        // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
+        uint8x16_t sum0 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[0]), bit_mask), vandq_u8(uint8x16_t(this->chunks[1]), bit_mask));
+        uint8x16_t sum1 = vpaddq_u8(vandq_u8(uint8x16_t(this->chunks[2]), bit_mask), vandq_u8(uint8x16_t(this->chunks[3]), bit_mask));
+        sum0 = vpaddq_u8(sum0, sum1);
+        sum0 = vpaddq_u8(sum0, sum0);
+        return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+        return simd8x64<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask,
+            this->chunks[2] > mask,
+            this->chunks[3] > mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask,
+            this->chunks[2] >= mask,
+            this->chunks[3] >= mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    {
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            simd8<uint8_t>(uint8x16_t(this->chunks[0])) >= mask,
+            simd8<uint8_t>(uint8x16_t(this->chunks[1])) >= mask,
+            simd8<uint8_t>(uint8x16_t(this->chunks[2])) >= mask,
+            simd8<uint8_t>(uint8x16_t(this->chunks[3])) >= mask)
+            .to_bitmask();
+    }
+}; // struct simd8x64<T>
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/simd16-inl.h
 /* begin file src/simdutf/arm64/simd16-inl.h */
 template<typename T>
 struct simd16;
 
-  template<typename T, typename Mask=simd16<bool>>
-  struct base_u16 {
+template<typename T, typename Mask = simd16<bool>>
+struct base_u16 {
     uint16x8_t value;
     static const int SIZE = sizeof(value);
 
     // Conversion from/to SIMD register
     simdutf_really_inline base_u16() = default;
-    simdutf_really_inline base_u16(const uint16x8_t _value) : value(_value) {}
+    simdutf_really_inline base_u16(const uint16x8_t _value)
+        : value(_value)
+    {
+    }
     simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
     simdutf_really_inline operator uint16x8_t&() { return this->value; }
     // Bit operations
@@ -803,167 +932,244 @@ struct simd16;
     simdutf_really_inline simd16<T> operator^(const simd16<T> other) const { return veorq_u16(*this, other); }
     simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const { return vbicq_u16(*this, other); }
     simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-    simdutf_really_inline simd16<T>& operator|=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline simd16<T>& operator&=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline simd16<T>& operator^=(const simd16<T> other) { auto this_cast = static_cast<simd16<T>*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
+    simdutf_really_inline simd16<T>& operator|=(const simd16<T> other)
+    {
+        auto this_cast = static_cast<simd16<T>*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline simd16<T>& operator&=(const simd16<T> other)
+    {
+        auto this_cast = static_cast<simd16<T>*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline simd16<T>& operator^=(const simd16<T> other)
+    {
+        auto this_cast = static_cast<simd16<T>*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
 
-    simdutf_really_inline Mask operator==(const simd16<T> other) const { return vceqq_u16(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return vceqq_u16(lhs, rhs); }
 
-    template<int N=1>
-    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-      return vextq_u18(prev_chunk, *this, 8 - N);
+    template<int N = 1> // N: number of trailing 16-bit lanes of prev_chunk to shift in
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    {
+        return vextq_u16(prev_chunk, *this, 8 - N); // lanes [8-N, 8) of prev_chunk, then lanes [0, 8-N) of *this
     }
-  };
-
-template<typename T, typename Mask=simd16<bool>>
-struct base16: base_u16<T> {
-  typedef uint16_t bitmask_t;
-  typedef uint32_t bitmask2_t;
+};
 
-  simdutf_really_inline base16() : base_u16<T>() {}
-  simdutf_really_inline base16(const uint16x8_t _value) : base_u16<T>(_value) {}
-  template <typename Pointer>
-  simdutf_really_inline base16(const Pointer* ptr) : base16(vld1q_u16(ptr)) {}
+template<typename T, typename Mask = simd16<bool>>
+struct base16 : base_u16<T> {
+    typedef uint16_t bitmask_t;
+    typedef uint32_t bitmask2_t;
 
-  simdutf_really_inline Mask operator==(const simd16<T> other) const { return vceqq_u16(*this, other); }
+    simdutf_really_inline base16()
+        : base_u16<T>()
+    {
+    }
+    simdutf_really_inline base16(const uint16x8_t _value)
+        : base_u16<T>(_value)
+    {
+    }
+    template<typename Pointer>
+    simdutf_really_inline base16(const Pointer* ptr)
+        : base16(vld1q_u16(ptr))
+    {
+    }
 
-  static const int SIZE = sizeof(base_u16<T>::value);
+    static const int SIZE = sizeof(base_u16<T>::value);
 
-  template<int N=1>
-  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-    return vextq_u18(prev_chunk, *this, 8 - N);
-  }
+    template<int N = 1> // N: number of trailing 16-bit lanes of prev_chunk to shift in
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    {
+        return vextq_u16(prev_chunk, *this, 8 - N); // lanes [8-N, 8) of prev_chunk, then lanes [0, 8-N) of *this
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool>: base16<bool> {
-  static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
-
-  simdutf_really_inline simd16<bool>() : base16() {}
-  simdutf_really_inline simd16<bool>(const uint16x8_t _value) : base16<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
+struct simd16<bool> : base16<bool> {
+    static simdutf_really_inline simd16<bool> splat(bool _value) { return vmovq_n_u16(uint16_t(-(!!_value))); }
 
+    simdutf_really_inline simd16<bool>()
+        : base16()
+    {
+    }
+    simdutf_really_inline simd16<bool>(const uint16x8_t _value)
+        : base16<bool>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16<bool>(bool _value)
+        : base16<bool>(splat(_value))
+    {
+    }
 };
 
 template<typename T>
-struct base16_numeric: base16<T> {
-  static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); }
-  static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
-  static simdutf_really_inline simd16<T> load(const T values[8]) {
-    return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
-  }
-
-  simdutf_really_inline base16_numeric() : base16<T>() {}
-  simdutf_really_inline base16_numeric(const uint16x8_t _value) : base16<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[8]) const { return vst1q_u16(dst, *this); }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u8(*this, other); }
-  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u8(*this, other); }
-  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
-  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+struct base16_numeric : base16<T> { // arithmetic layer shared by the int16_t and uint16_t lane wrappers
+    static simdutf_really_inline simd16<T> splat(T _value) { return vmovq_n_u16(_value); } // broadcast one value to all 8 lanes
+    static simdutf_really_inline simd16<T> zero() { return vdupq_n_u16(0); }
+    static simdutf_really_inline simd16<T> load(const T values[8])
+    {
+        return vld1q_u16(reinterpret_cast<const uint16_t*>(values));
+    }
+
+    simdutf_really_inline base16_numeric()
+        : base16<T>()
+    {
+    }
+    simdutf_really_inline base16_numeric(const uint16x8_t _value)
+        : base16<T>(_value)
+    {
+    }
+
+    // Store to array (cast mirrors load(): vst1q_u16 takes uint16_t*, T may be int16_t)
+    simdutf_really_inline void store(T dst[8]) const { vst1q_u16(reinterpret_cast<uint16_t*>(dst), *this); }
+
+    // Override to distinguish from bool version; inverts all 16 bits of every lane
+    simdutf_really_inline simd16<T> operator~() const { return vmvnq_u16(*this); }
+
+    // Addition/subtraction are the same for signed and unsigned (16-bit lanes, so the u16 intrinsics)
+    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return vaddq_u16(*this, other); }
+    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return vsubq_u16(*this, other); }
+    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd16<T>*>(this);
+    }
+    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd16<T>*>(this);
+    }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+    simdutf_really_inline simd16()
+        : base16_numeric<int16_t>()
+    {
+    }
 #ifndef SIMDUTF_REGULAR_VISUAL_STUDIO
-  simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<int16_t>(_value) {}
+    simdutf_really_inline simd16(const uint16x8_t _value)
+        : base16_numeric<int16_t>(_value)
+    {
+    }
 #endif
-  simdutf_really_inline simd16(const int16x8_t _value) : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value)) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
-  simdutf_really_inline operator simd16<uint16_t>() const;
-  simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
-  simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
-
-  simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
-  simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
-  // Order-sensitive comparisons
-  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
-};
-
+    simdutf_really_inline simd16(const int16x8_t _value)
+        : base16_numeric<int16_t>(vreinterpretq_u16_s16(_value))
+    {
+    }
 
+    // Splat constructor
+    simdutf_really_inline simd16(int16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const int16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const int16_t*>(values)))
+    {
+    }
+    simdutf_really_inline operator simd16<uint16_t>() const;
+    simdutf_really_inline operator const uint16x8_t&() const { return this->value; }
+    simdutf_really_inline operator const int16x8_t() const { return vreinterpretq_s16_u16(this->value); }
 
+    simdutf_really_inline int16_t max_val() const { return vmaxvq_s16(vreinterpretq_s16_u16(this->value)); }
+    simdutf_really_inline int16_t min_val() const { return vminvq_s16(vreinterpretq_s16_u16(this->value)); }
+    // Order-sensitive comparisons
+    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return vmaxq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return vminq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); } // lane-wise signed minimum
+    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return vcgtq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return vcltq_s16(vreinterpretq_s16_u16(this->value), vreinterpretq_s16_u16(other.value)); }
+};
 
 // Unsigned words
 template<>
-struct simd16<uint16_t>: base16_numeric<uint16_t>  {
-  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
-  simdutf_really_inline simd16(const uint16x8_t _value) : base16_numeric<uint16_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
-
-
-  simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); }
-  simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); }
-  // Saturated math
-  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
-
-  // Order-specific operations
-  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return  vcgtq_u16(*this, other); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
-
-  // Bit-specific operations
-  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
-
-  // logical operations
-  simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
-
-  // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
-  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
-    return vqmovn_high_u16(vqmovn_u16(v0), v1);
-  }
-
-  // Change the endianness
-  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
-    #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    #else
-    const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-    #endif
-    return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
-  }
+struct simd16<uint16_t> : base16_numeric<uint16_t> {
+    simdutf_really_inline simd16()
+        : base16_numeric<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const uint16x8_t _value)
+        : base16_numeric<uint16_t>(_value)
+    {
+    }
+
+    // Splat constructor
+    simdutf_really_inline simd16(uint16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const uint16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
+    {
+    }
+
+    simdutf_really_inline int16_t max_val() const { return vmaxvq_u16(*this); } // NOTE(review): vmaxvq_u16 yields uint16_t but the declared return type is int16_t, so results above 0x7FFF do not fit — confirm callers
+    simdutf_really_inline int16_t min_val() const { return vminvq_u16(*this); } // NOTE(review): same int16_t narrowing of an unsigned reduction as max_val() — confirm callers
+    // Saturated math
+    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return vqaddq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return vqsubq_u16(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return vmaxq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return vminq_u16(*this, other); }
+    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return vcleq_u16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return vcgeq_u16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return vcgtq_u16(*this, other); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return vcltq_u16(*this, other); }
+
+    // Bit-specific operations
+    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(vshrq_n_u16(*this, N)); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(vshlq_n_u16(*this, N)); }
+
+    // logical operations
+    simdutf_really_inline simd16<uint16_t> operator|(const simd16<uint16_t> other) const { return vorrq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> operator&(const simd16<uint16_t> other) const { return vandq_u16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> operator^(const simd16<uint16_t> other) const { return veorq_u16(*this, other); }
+
+    // Pack two uint16_t vectors into a single uint8_t vector, narrowing each lane with unsigned saturation
+    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
+    {
+        return vqmovn_high_u16(vqmovn_u16(v0), v1);
+    }
+
+    // Change the endianness
+    simdutf_really_inline simd16<uint16_t> swap_bytes() const
+    {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+        const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+        return vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(*this), swap));
+    }
 };
 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
 
-
-  template<typename T>
-  struct simd16x32 {
+template<typename T>
+struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 4, "ARM kernel should use four registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -972,122 +1178,138 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd16x32(const T* ptr)
+        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T)) }
+    {
+    }
 
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
     }
 
-    simdutf_really_inline simd16<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    simdutf_really_inline simd16<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return reduce_or().is_ascii();
     }
 
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
-      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
-      this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
-      this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+        this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+        this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t bit_mask = make_uint8x16_t(
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      );
+        const uint8x16_t bit_mask = make_uint8x16_t(
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
 #else
-      const uint8x16_t bit_mask = {
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
-      };
+        const uint8x16_t bit_mask = {
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+            0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+        };
 #endif
-      // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
-      uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
-      uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
-      sum0 = vpaddq_u8(sum0, sum1);
-      sum0 = vpaddq_u8(sum0, sum0);
-      return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
-    }
-
-    simdutf_really_inline void swap_bytes() {
-      this->chunks[0] = this->chunks[0].swap_bytes();
-      this->chunks[1] = this->chunks[1].swap_bytes();
-      this->chunks[2] = this->chunks[2].swap_bytes();
-      this->chunks[3] = this->chunks[3].swap_bytes();
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-    const simd16<T> mask = simd16<T>::splat(m);
-    return  simd16x32<bool>(
-      this->chunks[0] == mask,
-      this->chunks[1] == mask,
-      this->chunks[2] == mask,
-      this->chunks[3] == mask
-    ).to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t lteq(const T m) const {
-    const simd16<T> mask = simd16<T>::splat(m);
-    return  simd16x32<bool>(
-      this->chunks[0] <= mask,
-      this->chunks[1] <= mask,
-      this->chunks[2] <= mask,
-      this->chunks[3] <= mask
-    ).to_bitmask();
-  }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
-
-      return  simd16x32<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
-      return  simd16x32<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-
-  }; // struct simd16x32<T>
-  template<>
-  simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const {
-      const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
-      const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
-      simd16x32<uint16_t> x(
+        // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
+        uint8x16_t sum0 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[0] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[1] & vreinterpretq_u16_u8(bit_mask)));
+        uint8x16_t sum1 = vpaddq_u8(vreinterpretq_u8_u16(this->chunks[2] & vreinterpretq_u16_u8(bit_mask)), vreinterpretq_u8_u16(this->chunks[3] & vreinterpretq_u16_u8(bit_mask)));
+        sum0 = vpaddq_u8(sum0, sum1);
+        sum0 = vpaddq_u8(sum0, sum0);
+        return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+    }
+
+    simdutf_really_inline void swap_bytes()
+    {
+        this->chunks[0] = this->chunks[0].swap_bytes();
+        this->chunks[1] = this->chunks[1].swap_bytes();
+        this->chunks[2] = this->chunks[2].swap_bytes();
+        this->chunks[3] = this->chunks[3].swap_bytes();
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
+
+        return simd16x32<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
+        return simd16x32<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+
+}; // struct simd16x32<T>
+template<>
+simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(const uint16_t low, const uint16_t high) const
+{
+    const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low);
+    const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high);
+    simd16x32<uint16_t> x(
         simd16<uint16_t>((this->chunks[0] > mask_high) | (this->chunks[0] < mask_low)),
         simd16<uint16_t>((this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)),
         simd16<uint16_t>((this->chunks[2] > mask_high) | (this->chunks[2] < mask_low)),
-        simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
-      );
-      return  x.to_bitmask();
-    }
+        simd16<uint16_t>((this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)));
+    return x.to_bitmask();
+}
 /* end file src/simdutf/arm64/simd16-inl.h */
 } // namespace simd
 } // unnamed namespace
@@ -1097,7 +1319,7 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
 #endif // SIMDUTF_ARM64_SIMD_H
 /* end file src/simdutf/arm64/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/end.h
 /* begin file src/simdutf/arm64/end.h */
 /* end file src/simdutf/arm64/end.h */
 
@@ -1105,13 +1327,11 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
 
 #endif // SIMDUTF_ARM64_H
 /* end file src/simdutf/arm64.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake.h
 /* begin file src/simdutf/icelake.h */
 #ifndef SIMDUTF_ICELAKE_H
 #define SIMDUTF_ICELAKE_H
 
-
-
 #ifdef __has_include
 // How do we detect that a compiler supports vbmi2?
 // For sure if the following header is found, we are ok?
@@ -1133,18 +1353,15 @@ simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { retur
 #define SIMDUTF_IMPLEMENTATION_ICELAKE ((SIMDUTF_IS_X86_64) && (SIMDUTF_COMPILER_SUPPORTS_VBMI2))
 #endif
 
-// To see why  (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see
+// To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
 // https://github.com/simdutf/simdutf/issues/1247
-#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && \
-                                         SIMDUTF_HAS_AVX512DQ && \
-                                         SIMDUTF_HAS_AVX512VL && \
-                                           SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
+#define SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ((SIMDUTF_IMPLEMENTATION_ICELAKE) && (SIMDUTF_IS_X86_64) && (__AVX2__) && (SIMDUTF_HAS_AVX512F && SIMDUTF_HAS_AVX512DQ && SIMDUTF_HAS_AVX512VL && SIMDUTF_HAS_AVX512VBMI2) && (!SIMDUTF_IS_32BITS))
 
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 #define SIMDUTF_TARGET_ICELAKE
 #else
-#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt")
+#define SIMDUTF_TARGET_ICELAKE SIMDUTF_TARGET_REGION("avx512f,avx512dq,avx512cd,avx512bw,avx512vbmi,avx512vbmi2,avx512vl,avx2,bmi,bmi2,pclmul,lzcnt,popcnt")
 #endif
 
 namespace simdutf {
@@ -1152,20 +1369,17 @@ namespace icelake {
 } // namespace icelake
 } // namespace simdutf
 
-
-
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/intrinsics.h
 /* begin file src/simdutf/icelake/intrinsics.h */
 #ifndef SIMDUTF_ICELAKE_INTRINSICS_H
 #define SIMDUTF_ICELAKE_INTRINSICS_H
 
-
 #ifdef SIMDUTF_VISUAL_STUDIO
 // under clang within visual studio, this will include <x86intrin.h>
-#include <intrin.h>  // visual studio or clang
+#include <intrin.h> // visual studio or clang
 #include <immintrin.h>
 #else
 
@@ -1179,7 +1393,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
-
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -1209,15 +1422,14 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * <x86intrin.h>  (or <intrin.h>) before, so the headers
  * are fooled.
  */
-#include <bmiintrin.h>   // for _blsr_u64
-#include <bmi2intrin.h>  // for _pext_u64, _pdep_u64
+#include <bmiintrin.h> // for _blsr_u64
+#include <bmi2intrin.h> // for _pext_u64, _pdep_u64
 #include <lzcntintrin.h> // for  __lzcnt64
-#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
+#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
 #include <smmintrin.h>
 #include <tmmintrin.h>
 #include <avxintrin.h>
 #include <avx2intrin.h>
-#include <wmmintrin.h>   // for  _mm_clmulepi64_si128
 // Important: we need the AVX-512 headers:
 #include <avx512fintrin.h>
 #include <avx512dqintrin.h>
@@ -1235,8 +1447,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif //  _blsr_u64
 #endif // SIMDUTF_CLANG_VISUAL_STUDIO
 
-
-
 #if defined(__GNUC__) && !defined(__clang__)
 
 #if __GNUC__ == 8
@@ -1253,27 +1463,27 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /**
  * GCC 8 fails to provide _mm512_set_epi8. We roll our own.
  */
-inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63) {
-  return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
-                          uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
-                          uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
-                          uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
-                          uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
-                          uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
-                          uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
-                          uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
+inline __m512i _mm512_set_epi8(uint8_t a0, uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8, uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16, uint8_t a17, uint8_t a18, uint8_t a19, uint8_t a20, uint8_t a21, uint8_t a22, uint8_t a23, uint8_t a24, uint8_t a25, uint8_t a26, uint8_t a27, uint8_t a28, uint8_t a29, uint8_t a30, uint8_t a31, uint8_t a32, uint8_t a33, uint8_t a34, uint8_t a35, uint8_t a36, uint8_t a37, uint8_t a38, uint8_t a39, uint8_t a40, uint8_t a41, uint8_t a42, uint8_t a43, uint8_t a44, uint8_t a45, uint8_t a46, uint8_t a47, uint8_t a48, uint8_t a49, uint8_t a50, uint8_t a51, uint8_t a52, uint8_t a53, uint8_t a54, uint8_t a55, uint8_t a56, uint8_t a57, uint8_t a58, uint8_t a59, uint8_t a60, uint8_t a61, uint8_t a62, uint8_t a63)
+{ // Packs eight bytes into each 64-bit argument of _mm512_set_epi64: a0 lands in bits 56..63 of the first argument, a63 in bits 0..7 of the last.
+    return _mm512_set_epi64(uint64_t(a7) + (uint64_t(a6) << 8) + (uint64_t(a5) << 16) + (uint64_t(a4) << 24) + (uint64_t(a3) << 32) + (uint64_t(a2) << 40) + (uint64_t(a1) << 48) + (uint64_t(a0) << 56),
+        uint64_t(a15) + (uint64_t(a14) << 8) + (uint64_t(a13) << 16) + (uint64_t(a12) << 24) + (uint64_t(a11) << 32) + (uint64_t(a10) << 40) + (uint64_t(a9) << 48) + (uint64_t(a8) << 56),
+        uint64_t(a23) + (uint64_t(a22) << 8) + (uint64_t(a21) << 16) + (uint64_t(a20) << 24) + (uint64_t(a19) << 32) + (uint64_t(a18) << 40) + (uint64_t(a17) << 48) + (uint64_t(a16) << 56),
+        uint64_t(a31) + (uint64_t(a30) << 8) + (uint64_t(a29) << 16) + (uint64_t(a28) << 24) + (uint64_t(a27) << 32) + (uint64_t(a26) << 40) + (uint64_t(a25) << 48) + (uint64_t(a24) << 56),
+        uint64_t(a39) + (uint64_t(a38) << 8) + (uint64_t(a37) << 16) + (uint64_t(a36) << 24) + (uint64_t(a35) << 32) + (uint64_t(a34) << 40) + (uint64_t(a33) << 48) + (uint64_t(a32) << 56),
+        uint64_t(a47) + (uint64_t(a46) << 8) + (uint64_t(a45) << 16) + (uint64_t(a44) << 24) + (uint64_t(a43) << 32) + (uint64_t(a42) << 40) + (uint64_t(a41) << 48) + (uint64_t(a40) << 56),
+        uint64_t(a55) + (uint64_t(a54) << 8) + (uint64_t(a53) << 16) + (uint64_t(a52) << 24) + (uint64_t(a51) << 32) + (uint64_t(a50) << 40) + (uint64_t(a49) << 48) + (uint64_t(a48) << 56),
+        uint64_t(a63) + (uint64_t(a62) << 8) + (uint64_t(a61) << 16) + (uint64_t(a60) << 24) + (uint64_t(a59) << 32) + (uint64_t(a58) << 40) + (uint64_t(a57) << 48) + (uint64_t(a56) << 56));
 }
 #pragma GCC pop_options
 #endif // SIMDUTF_GCC8
 
 #endif // SIMDUTF_HASWELL_INTRINSICS_H
 /* end file src/simdutf/icelake/intrinsics.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/implementation.h
 /* begin file src/simdutf/icelake/implementation.h */
 #ifndef SIMDUTF_ICELAKE_IMPLEMENTATION_H
 #define SIMDUTF_ICELAKE_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace icelake {
 
@@ -1283,63 +1493,88 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation(
-      "icelake",
-      "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
-      internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2 ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    // Constructs the "icelake" backend entry: forwards its name, its description
+    // and the instruction-set flags listed below to simdutf::implementation.
+    simdutf_really_inline implementation()
+        : simdutf::implementation(
+            "icelake",
+            "Intel AVX512 (AVX-512BW, AVX-512CD, AVX-512VL, AVX-512VBMI2 extensions)",
+            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2 | internal::instruction_set::AVX512BW | internal::instruction_set::AVX512CD | internal::instruction_set::AVX512VL | internal::instruction_set::AVX512VBMI2) {}
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace icelake
@@ -1351,7 +1586,7 @@ public:
 //
 // The rest need to be inside the region
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/begin.h
 /* begin file src/simdutf/icelake/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
 // #define SIMDUTF_IMPLEMENTATION icelake
@@ -1363,11 +1598,11 @@ SIMDUTF_TARGET_ICELAKE
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) // warning name must be written without spaces
 #endif // end of workaround
 /* end file src/simdutf/icelake/begin.h */
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/bitmanipulation.h
 /* begin file src/simdutf/icelake/bitmanipulation.h */
 #ifndef SIMDUTF_ICELAKE_BITMANIPULATION_H
 #define SIMDUTF_ICELAKE_BITMANIPULATION_H
@@ -1377,13 +1612,15 @@ namespace icelake {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
+{ // Visual Studio spells the 64-bit popcount intrinsic with two underscores; legacy 32-bit Windows is not supported.
+    const unsigned __int64 set_bits = __popcnt64(input_num);
+    return set_bits;
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num) {
-  return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num)
+{ // Returns the result of the _popcnt64 intrinsic applied to input_num.
+    return _popcnt64(input_num);
 }
 #endif
 
@@ -1393,7 +1630,7 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_ICELAKE_BITMANIPULATION_H
 /* end file src/simdutf/icelake/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/end.h
 /* begin file src/simdutf/icelake/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 // nothing needed.
@@ -1401,18 +1638,15 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
 /* end file src/simdutf/icelake/end.h */
 
-
-
 #endif // SIMDUTF_IMPLEMENTATION_ICELAKE
 #endif // SIMDUTF_ICELAKE_H
 /* end file src/simdutf/icelake.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell.h
 /* begin file src/simdutf/haswell.h */
 #ifndef SIMDUTF_HASWELL_H
 #define SIMDUTF_HASWELL_H
@@ -1424,7 +1658,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #error "haswell.h must be included before fallback.h"
 #endif
 
-
 // Default Haswell to on if this is x86-64. Even if we're not compiled for it, it could be selected
 // at runtime.
 #ifndef SIMDUTF_IMPLEMENTATION_HASWELL
@@ -1439,13 +1672,13 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif
 
 #endif
-// To see why  (__BMI__) && (__PCLMUL__) && (__LZCNT__) are not part of this next line, see
+// To see why  (__BMI__) && (__LZCNT__) are not part of this next line, see
 // https://github.com/simdutf/simdutf/issues/1247
 #define SIMDUTF_CAN_ALWAYS_RUN_HASWELL ((SIMDUTF_IMPLEMENTATION_HASWELL) && (SIMDUTF_IS_X86_64) && (__AVX2__))
 
 #if SIMDUTF_IMPLEMENTATION_HASWELL
 
-#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,pclmul,lzcnt")
+#define SIMDUTF_TARGET_HASWELL SIMDUTF_TARGET_REGION("avx2,bmi,lzcnt,popcnt")
 
 namespace simdutf {
 /**
@@ -1458,12 +1691,11 @@ namespace haswell {
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/implementation.h
 /* begin file src/simdutf/haswell/implementation.h */
 #ifndef SIMDUTF_HASWELL_IMPLEMENTATION_H
 #define SIMDUTF_HASWELL_IMPLEMENTATION_H
 
-
 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
 namespace simdutf {
 namespace haswell {
@@ -1472,64 +1704,88 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation(
-      "haswell",
-      "Intel/AMD AVX2",
-      internal::instruction_set::AVX2 | internal::instruction_set::PCLMULQDQ | internal::instruction_set::BMI1 | internal::instruction_set::BMI2
-  ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation(
+            "haswell",
+            "Intel/AMD AVX2",
+            internal::instruction_set::AVX2 | internal::instruction_set::BMI1 | internal::instruction_set::BMI2)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace haswell
@@ -1537,15 +1793,14 @@ public:
 
 #endif // SIMDUTF_HASWELL_IMPLEMENTATION_H
 /* end file src/simdutf/haswell/implementation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/intrinsics.h
 /* begin file src/simdutf/haswell/intrinsics.h */
 #ifndef SIMDUTF_HASWELL_INTRINSICS_H
 #define SIMDUTF_HASWELL_INTRINSICS_H
 
-
 #ifdef SIMDUTF_VISUAL_STUDIO
 // under clang within visual studio, this will include <x86intrin.h>
-#include <intrin.h>  // visual studio or clang
+#include <intrin.h> // visual studio or clang
 #else
 
 #if SIMDUTF_GCC11ORMORE
@@ -1558,7 +1813,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
-
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -1585,14 +1839,13 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * <x86intrin.h>  (or <intrin.h>) before, so the headers
  * are fooled.
  */
-#include <bmiintrin.h>   // for _blsr_u64
+#include <bmiintrin.h> // for _blsr_u64
 #include <lzcntintrin.h> // for  __lzcnt64
-#include <immintrin.h>   // for most things (AVX2, AVX512, _popcnt64)
+#include <immintrin.h> // for most things (AVX2, AVX512, _popcnt64)
 #include <smmintrin.h>
 #include <tmmintrin.h>
 #include <avxintrin.h>
 #include <avx2intrin.h>
-#include <wmmintrin.h>   // for  _mm_clmulepi64_si128
 // unfortunately, we may not get _blsr_u64, but, thankfully, clang
 // has it as a macro.
 #ifndef _blsr_u64
@@ -1607,7 +1860,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 //
 // The rest need to be inside the region
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/begin.h
 /* begin file src/simdutf/haswell/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
 // #define SIMDUTF_IMPLEMENTATION haswell
@@ -1619,11 +1872,11 @@ SIMDUTF_TARGET_HASWELL
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/bitmanipulation.h
 /* begin file src/simdutf/haswell/bitmanipulation.h */
 #ifndef SIMDUTF_HASWELL_BITMANIPULATION_H
 #define SIMDUTF_HASWELL_BITMANIPULATION_H
@@ -1633,13 +1886,15 @@ namespace haswell {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num) {
-  return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num)
+{
+    return _popcnt64(input_num);
 }
 #endif
 
@@ -1649,190 +1904,254 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_HASWELL_BITMANIPULATION_H
 /* end file src/simdutf/haswell/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/simd.h
 /* begin file src/simdutf/haswell/simd.h */
 #ifndef SIMDUTF_HASWELL_SIMD_H
 #define SIMDUTF_HASWELL_SIMD_H
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace simd {
 
-  // Forward-declared so they can be used by splat and friends.
-  template<typename Child>
-  struct base {
+// Forward-declared so they can be used by splat and friends.
+template<typename Child>
+struct base {
     __m256i value;
 
     // Zero constructor
-    simdutf_really_inline base() : value{__m256i()} {}
+    simdutf_really_inline base()
+        : value { __m256i() }
+    {
+    }
 
     // Conversion from SIMD register
-    simdutf_really_inline base(const __m256i _value) : value(_value) {}
+    simdutf_really_inline base(const __m256i _value)
+        : value(_value)
+    {
+    }
     // Conversion to SIMD register
     simdutf_really_inline operator const __m256i&() const { return this->value; }
     simdutf_really_inline operator __m256i&() { return this->value; }
-    template <endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
-      __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this,1));
-      if (big_endian) {
-        const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-        first = _mm256_shuffle_epi8(first, swap);
-        second = _mm256_shuffle_epi8(second, swap);
-      }
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), first);
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr+8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this,8))));
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this,1)));
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this,1),8)));
+    template<endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        __m256i first = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(*this));
+        __m256i second = _mm256_cvtepu8_epi16(_mm256_extractf128_si256(*this, 1));
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            first = _mm256_shuffle_epi8(first, swap);
+            second = _mm256_shuffle_epi8(second, swap);
+        }
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), first);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 16), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    {
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(*this)));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 8), _mm256_cvtepu8_epi32(_mm256_castsi256_si128(_mm256_srli_si256(*this, 8))));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 16), _mm256_cvtepu8_epi32(_mm256_extractf128_si256(*this, 1)));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr + 24), _mm256_cvtepu8_epi32(_mm_srli_si128(_mm256_extractf128_si256(*this, 1), 8)));
     }
     // Bit operations
     simdutf_really_inline Child operator|(const Child other) const { return _mm256_or_si256(*this, other); }
     simdutf_really_inline Child operator&(const Child other) const { return _mm256_and_si256(*this, other); }
     simdutf_really_inline Child operator^(const Child other) const { return _mm256_xor_si256(*this, other); }
     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm256_andnot_si256(other, *this); }
-    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
-  };
+    simdutf_really_inline Child& operator|=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator&=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator^=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
+};
 
-  // Forward-declared so they can be used by splat and friends.
-  template<typename T>
-  struct simd8;
+// Forward-declared so they can be used by splat and friends.
+template<typename T>
+struct simd8;
 
-  template<typename T, typename Mask=simd8<bool>>
-  struct base8: base<simd8<T>> {
+template<typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
     typedef uint32_t bitmask_t;
     typedef uint64_t bitmask2_t;
 
-    simdutf_really_inline base8() : base<simd8<T>>() {}
-    simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
-    simdutf_really_inline T first() const { return _mm256_extract_epi8(*this,0); }
-    simdutf_really_inline T last() const { return _mm256_extract_epi8(*this,31); }
-    simdutf_really_inline Mask operator==(const simd8<T> other) const { return _mm256_cmpeq_epi8(*this, other); }
+    simdutf_really_inline base8()
+        : base<simd8<T>>()
+    {
+    }
+    simdutf_really_inline base8(const __m256i _value)
+        : base<simd8<T>>(_value)
+    {
+    }
+    simdutf_really_inline T first() const { return _mm256_extract_epi8(*this, 0); }
+    simdutf_really_inline T last() const { return _mm256_extract_epi8(*this, 31); }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm256_cmpeq_epi8(lhs, rhs); }
 
     static const int SIZE = sizeof(base<T>::value);
 
-    template<int N=1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
-      return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
+    {
+        return _mm256_alignr_epi8(*this, _mm256_permute2x128_si256(prev_chunk, *this, 0x21), 16 - N);
     }
-  };
+};
 
-  // SIMD byte mask type (returned by things like eq and gt)
-  template<>
-  struct simd8<bool>: base8<bool> {
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd8<bool> : base8<bool> {
     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm256_set1_epi8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8<bool>() : base8() {}
-    simdutf_really_inline simd8<bool>(const __m256i _value) : base8<bool>(_value) {}
+    simdutf_really_inline simd8<bool>()
+        : base8()
+    {
+    }
+    simdutf_really_inline simd8<bool>(const __m256i _value)
+        : base8<bool>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
+    simdutf_really_inline simd8<bool>(bool _value)
+        : base8<bool>(splat(_value))
+    {
+    }
 
     simdutf_really_inline uint32_t to_bitmask() const { return uint32_t(_mm256_movemask_epi8(*this)); }
     simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
     simdutf_really_inline bool none() const { return _mm256_testz_si256(*this, *this); }
     simdutf_really_inline bool all() const { return static_cast<uint32_t>(_mm256_movemask_epi8(*this)) == 0xFFFFFFFF; }
     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
-  };
+};
 
-  template<typename T>
-  struct base8_numeric: base8<T> {
+template<typename T>
+struct base8_numeric : base8<T> {
     static simdutf_really_inline simd8<T> splat(T _value) { return _mm256_set1_epi8(_value); }
     static simdutf_really_inline simd8<T> zero() { return _mm256_setzero_si256(); }
-    static simdutf_really_inline simd8<T> load(const T values[32]) {
-      return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
+    static simdutf_really_inline simd8<T> load(const T values[32])
+    {
+        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
     }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     static simdutf_really_inline simd8<T> repeat_16(
-      T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
-      T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
-    ) {
-      return simd8<T>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15,
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+        T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+    {
+        return simd8<T>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
-    simdutf_really_inline base8_numeric() : base8<T>() {}
-    simdutf_really_inline base8_numeric(const __m256i _value) : base8<T>(_value) {}
+    simdutf_really_inline base8_numeric()
+        : base8<T>()
+    {
+    }
+    simdutf_really_inline base8_numeric(const __m256i _value)
+        : base8<T>(_value)
+    {
+    }
 
     // Store to array
-    simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
+    simdutf_really_inline void store(T dst[32]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), *this); }
 
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm256_add_epi8(*this, other); }
     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm256_sub_epi8(*this, other); }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd8<T>*>(this);
+    }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd8<T>*>(this);
+    }
 
     // Override to distinguish from bool version
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return _mm256_shuffle_epi8(lookup_table, *this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return _mm256_shuffle_epi8(lookup_table, *this);
     }
 
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
-    }
-  };
-
-
-  // Signed bytes
-  template<>
-  struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
-    simdutf_really_inline simd8(const __m256i _value) : base8_numeric<int8_t>(_value) {}
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
+    }
+};
+
+// Signed bytes
+template<>
+struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<int8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m256i _value)
+        : base8_numeric<int8_t>(_value)
+    {
+    }
 
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const int8_t values[32])
+        : simd8(load(values))
+    {
+    }
     simdutf_really_inline operator simd8<uint8_t>() const;
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
-      int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
-      int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31
-    ) : simd8(_mm256_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15,
-      v16,v17,v18,v19,v20,v21,v22,v23,
-      v24,v25,v26,v27,v28,v29,v30,v31
-    )) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15,
+        int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+        int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31)
+        : simd8(_mm256_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v16, v17, v18, v19, v20, v21, v22, v23,
+            v24, v25, v26, v27, v28, v29, v30, v31))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) {
-      return simd8<int8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15,
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
     simdutf_really_inline bool is_ascii() const { return _mm256_movemask_epi8(*this) == 0; }
     // Order-sensitive comparisons
@@ -1840,43 +2159,54 @@ namespace simd {
     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm256_min_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm256_cmpgt_epi8(other, *this); }
-  };
+};
 
-  // Unsigned bytes
-  template<>
-  struct simd8<uint8_t>: base8_numeric<uint8_t> {
-    simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
-    simdutf_really_inline simd8(const __m256i _value) : base8_numeric<uint8_t>(_value) {}
+// Unsigned bytes
+template<>
+struct simd8<uint8_t> : base8_numeric<uint8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<uint8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m256i _value)
+        : base8_numeric<uint8_t>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint8_t values[32])
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
-      uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
-      uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31
-    ) : simd8(_mm256_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15,
-      v16,v17,v18,v19,v20,v21,v22,v23,
-      v24,v25,v26,v27,v28,v29,v30,v31
-    )) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, uint8_t v21, uint8_t v22, uint8_t v23,
+        uint8_t v24, uint8_t v25, uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, uint8_t v31)
+        : simd8(_mm256_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v16, v17, v18, v19, v20, v21, v22, v23,
+            v24, v25, v26, v27, v28, v29, v30, v31))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) {
-      return simd8<uint8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15,
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+    {
+        return simd8<uint8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15,
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
-
     // Saturated math
     simdutf_really_inline simd8<uint8_t> saturating_add(const simd8<uint8_t> other) const { return _mm256_adds_epu8(*this, other); }
     simdutf_really_inline simd8<uint8_t> saturating_sub(const simd8<uint8_t> other) const { return _mm256_subs_epu8(*this, other); }
@@ -1910,13 +2240,12 @@ namespace simd {
     // Get one of the bits and make a bitmask out of it.
     // e.g. value.get_bit<7>() gets the high bit
     template<int N>
-    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7-N)); }
-  };
-  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
-
+    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 7 - N)); }
+};
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
 
-  template<typename T>
-  struct simd8x64 {
+template<typename T>
+struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -1925,297 +2254,383 @@ namespace simd {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1) : chunks{chunk0, chunk1} {}
-    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T))} {}
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
+        : chunks { chunk0, chunk1 }
+    {
+    }
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)) }
+    {
+    }
 
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
-      uint64_t r_hi =                       this->chunks[1].to_bitmask();
-      return r_lo | (r_hi << 32);
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r_hi = this->chunks[1].to_bitmask();
+        return r_lo | (r_hi << 32);
     }
 
-    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      return *this;
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        return *this;
     }
 
-    simdutf_really_inline simd8<T> reduce_or() const {
-      return this->chunks[0] | this->chunks[1];
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return this->chunks[0] | this->chunks[1];
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii();
     }
 
-    template <endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
+    template<endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
     }
 
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
     }
 
-    simdutf_really_inline simd8x64<T> bit_or(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return simd8x64<T>(
-        this->chunks[0] | mask,
-        this->chunks[1] | mask
-      );
+    simdutf_really_inline simd8x64<T> bit_or(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<T>(
+            this->chunks[0] | mask,
+            this->chunks[1] | mask);
     }
 
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
-      return  simd8x64<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1]
-      ).to_bitmask();
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
+    {
+        return simd8x64<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1])
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
 
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-      return  simd8x64<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)
-      ).to_bitmask();
+        // The Haswell simd8x64 holds exactly NUM_CHUNKS == 2 registers, so only
+        // chunks[0] and chunks[1] exist and simd8x64<bool> is built from two masks.
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+        return simd8x64<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t gt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask)
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t gteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask)
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
-        (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask)
-      ).to_bitmask();
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    {
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            (simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
+            (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
+            .to_bitmask();
     }
-  }; // struct simd8x64<T>
+}; // struct simd8x64<T>
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/simd16-inl.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/simd16-inl.h
 /* begin file src/simdutf/haswell/simd16-inl.h */
 #ifdef __GNUC__
 #if __GNUC__ < 8
 #define _mm256_set_m128i(xmm1, xmm2) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
-#define _mm256_setr_m128i(xmm2, xmm1)  _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
+#define _mm256_setr_m128i(xmm2, xmm1) _mm256_permute2f128_si256(_mm256_castsi128_si256(xmm1), _mm256_castsi128_si256(xmm2), 2)
 #endif
 #endif
 
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask=simd16<bool>>
-struct base16: base<simd16<T>> {
-  using bitmask_type = uint32_t;
-
-  simdutf_really_inline base16() : base<simd16<T>>() {}
-  simdutf_really_inline base16(const __m256i _value) : base<simd16<T>>(_value) {}
-  template <typename Pointer>
-  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr))) {}
+template<typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> {
+    using bitmask_type = uint32_t;
 
-  simdutf_really_inline Mask operator==(const simd16<T> other) const { return _mm256_cmpeq_epi16(*this, other); }
+    simdutf_really_inline base16()
+        : base<simd16<T>>()
+    {
+    }
+    simdutf_really_inline base16(const __m256i _value)
+        : base<simd16<T>>(_value)
+    {
+    }
+    template<typename Pointer>
+    simdutf_really_inline base16(const Pointer* ptr)
+        : base16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)))
+    {
+    }
+    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm256_cmpeq_epi16(lhs, rhs); }
 
-  /// the size of vector in bytes
-  static const int SIZE = sizeof(base<simd16<T>>::value);
+    /// the size of vector in bytes
+    static const int SIZE = sizeof(base<simd16<T>>::value);
 
-  /// the number of elements of type T a vector can hold
-  static const int ELEMENTS = SIZE / sizeof(T);
+    /// the number of elements of type T a vector can hold
+    static const int ELEMENTS = SIZE / sizeof(T);
 
-  template<int N=1>
-  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-    return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
-  }
+    template<int N = 1>
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    {
+        return _mm256_alignr_epi8(*this, prev_chunk, 16 - N);
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool>: base16<bool> {
-  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
+struct simd16<bool> : base16<bool> {
+    static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm256_set1_epi16(uint16_t(-(!!_value))); }
 
-  simdutf_really_inline simd16<bool>() : base16() {}
-  simdutf_really_inline simd16<bool>(const __m256i _value) : base16<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
+    simdutf_really_inline simd16()
+        : base16<bool>()
+    {
+    }
+    simdutf_really_inline simd16(const __m256i _value)
+        : base16<bool>(_value)
+    {
+    }
+    // Splat constructor: every lane is all-ones for true, all-zeros for false (see splat()).
+    simdutf_really_inline simd16(bool _value)
+        : base16<bool>(splat(_value))
+    {
+    }
 
-  simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
-  simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
-  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+    simdutf_really_inline bitmask_type to_bitmask() const { return _mm256_movemask_epi8(*this); }
+    simdutf_really_inline bool any() const { return !_mm256_testz_si256(*this, *this); }
+    simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
 };
 
 template<typename T>
-struct base16_numeric: base16<T> {
-  static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
-  static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
-  static simdutf_really_inline simd16<T> load(const T values[8]) {
-    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(values));
-  }
-
-  simdutf_really_inline base16_numeric() : base16<T>() {}
-  simdutf_really_inline base16_numeric(const __m256i _value) : base16<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[8]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), *this); }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
-  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
-  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
-  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+struct base16_numeric : base16<T> {
+    static simdutf_really_inline simd16<T> splat(T _value) { return _mm256_set1_epi16(_value); }
+    static simdutf_really_inline simd16<T> zero() { return _mm256_setzero_si256(); }
+    static simdutf_really_inline simd16<T> load(const T values[16])
+    {
+        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values)); // unaligned 256-bit (32-byte) load
+    }
+
+    simdutf_really_inline base16_numeric()
+        : base16<T>()
+    {
+    }
+    simdutf_really_inline base16_numeric(const __m256i _value)
+        : base16<T>(_value)
+    {
+    }
+
+    // Store the whole 256-bit register (32 bytes) to a possibly unaligned array
+    simdutf_really_inline void store(T dst[16]) const { return _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), *this); }
+
+    // Override to distinguish from bool version
+    simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm256_add_epi16(*this, other); }
+    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm256_sub_epi16(*this, other); }
+    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd16<T>*>(this);
+    }
+    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd16<T>*>(this);
+    }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
-  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<int16_t>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
-  // Order-sensitive comparisons
-  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
-  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
+    simdutf_really_inline simd16()
+        : base16_numeric<int16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m256i _value)
+        : base16_numeric<int16_t>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16(int16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const int16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const int16_t*>(values)))
+    {
+    }
+    // Order-sensitive comparisons
+    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm256_max_epi16(*this, other); }
+    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm256_min_epi16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(*this, other); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm256_cmpgt_epi16(other, *this); }
 };
 
 // Unsigned words
 template<>
-struct simd16<uint16_t>: base16_numeric<uint16_t>  {
-  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
-  simdutf_really_inline simd16(const __m256i _value) : base16_numeric<uint16_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
-
-  // Saturated math
-  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
-
-  // Order-specific operations
-  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-
-  // Bit-specific operations
-  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
-
-  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
-  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
-  simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
-  simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
-  // Get one of the bits and make a bitmask out of it.
-  // e.g. value.get_bit<7>() gets the high bit
-  template<int N>
-  simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15-N)); }
-
-  // Change the endianness
-  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
-    const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-    return _mm256_shuffle_epi8(*this, swap);
-  }
-
-  // Pack with the unsigned saturation two uint16_t words into single uint8_t vector
-  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
-    // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
-    //       we have to shuffle lanes in order to produce bytes in the
-    //       correct order.
-
-    // get the 0th lanes
-    const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
-    const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
-
-    // get the 1st lanes
-    const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
-    const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
-
-    // build new vectors (shuffle lanes)
-    const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
-    const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
-
-    // pack words in linear order from v0 and v1
-    return _mm256_packus_epi16(t0, t1);
-  }
-};
+struct simd16<uint16_t> : base16_numeric<uint16_t> {
+    simdutf_really_inline simd16()
+        : base16_numeric<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m256i _value)
+        : base16_numeric<uint16_t>(_value)
+    {
+    }
+
+    // Splat constructor
+    simdutf_really_inline simd16(uint16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const uint16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
+    {
+    }
+
+    // Saturated math
+    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm256_adds_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm256_subs_epu16(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm256_max_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm256_min_epu16(*this, other); }
+    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); } // true in lanes where other - *this (saturating) is nonzero
+
+    // Bit-specific operations
+    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+    simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm256_testz_si256(*this, *this); }
+    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+    simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm256_testz_si256(*this, bits); }
+    simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm256_srli_epi16(*this, N)); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm256_slli_epi16(*this, N)); }
+    // Get one of the bits and make a bitmask out of it (one mask bit per byte, via _mm256_movemask_epi8).
+    // e.g. value.get_bit<15>() gets the high bit of each 16-bit lane
+    template<int N>
+    simdutf_really_inline int get_bit() const { return _mm256_movemask_epi8(_mm256_slli_epi16(*this, 15 - N)); }
+
+    // Change the endianness
+    simdutf_really_inline simd16<uint16_t> swap_bytes() const
+    {
+        const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+            17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+        return _mm256_shuffle_epi8(*this, swap);
+    }
+
+    // Pack two vectors of uint16_t words into a single uint8_t vector, with unsigned saturation
+    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
+    {
+        // Note: the AVX2 variant of pack operates on 128-bit lanes, thus
+        //       we have to shuffle lanes in order to produce bytes in the
+        //       correct order.
 
+        // get the 0th lanes
+        const __m128i lo_0 = _mm256_extracti128_si256(v0, 0);
+        const __m128i lo_1 = _mm256_extracti128_si256(v1, 0);
+
+        // get the 1st lanes
+        const __m128i hi_0 = _mm256_extracti128_si256(v0, 1);
+        const __m128i hi_1 = _mm256_extracti128_si256(v1, 1);
+
+        // build new vectors (shuffle lanes)
+        const __m256i t0 = _mm256_set_m128i(lo_1, lo_0);
+        const __m256i t1 = _mm256_set_m128i(hi_1, hi_0);
+
+        // pack words in linear order from v0 and v1
+        return _mm256_packus_epi16(t0, t1);
+    }
+};
 
-  template<typename T>
-  struct simd16x32 {
+template<typename T>
+struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 2, "Haswell kernel should use two registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -2224,96 +2639,114 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1) : chunks{chunk0, chunk1} {}
-    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T))} {}
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1)
+        : chunks { chunk0, chunk1 }
+    {
+    }
+    simdutf_really_inline simd16x32(const T* ptr)
+        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)) }
+    {
+    }
 
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
     }
 
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
-      uint64_t r_hi =                       this->chunks[1].to_bitmask();
-      return r_lo | (r_hi << 32);
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r_hi = this->chunks[1].to_bitmask();
+        return r_lo | (r_hi << 32);
     }
 
-    simdutf_really_inline simd16<T> reduce_or() const {
-      return this->chunks[0] | this->chunks[1];
+    simdutf_really_inline simd16<T> reduce_or() const
+    {
+        return this->chunks[0] | this->chunks[1];
     }
 
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii();
     }
 
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
-      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>));
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0); // first chunk is written at ptr itself
+        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>)); // NOTE(review): offset is sizeof(simd16<T>) char16_t elements, while store() steps by sizeof(simd16<T>) / sizeof(T) - confirm this is intended
     }
 
-    simdutf_really_inline simd16x32<T> bit_or(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return simd16x32<T>(
-        this->chunks[0] | mask,
-        this->chunks[1] | mask
-      );
+    simdutf_really_inline simd16x32<T> bit_or(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<T>(
+            this->chunks[0] | mask,
+            this->chunks[1] | mask);
     }
 
-    simdutf_really_inline void swap_bytes() {
-      this->chunks[0] = this->chunks[0].swap_bytes();
-      this->chunks[1] = this->chunks[1].swap_bytes();
+    simdutf_really_inline void swap_bytes()
+    {
+        this->chunks[0] = this->chunks[0].swap_bytes();
+        this->chunks[1] = this->chunks[1].swap_bytes();
     }
 
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
-      return  simd16x32<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1]
-      ).to_bitmask();
+    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t>& other) const
+    {
+        return simd16x32<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1])
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask)
+            .to_bitmask();
     }
 
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
 
-      return  simd16x32<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
-      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
-      return simd16x32<bool>(
-        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)
-      ).to_bitmask();
+        // NUM_CHUNKS is 2 (enforced by the static_assert in this struct), so only
+        // chunks[0] and chunks[1] exist and the constructor takes exactly two chunks.
+        return simd16x32<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+            .to_bitmask();
     }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask
-      ).to_bitmask();
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
+        const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
+        return simd16x32<bool>(
+            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask)
+            .to_bitmask();
     }
-  }; // struct simd16x32<T>
+}; // struct simd16x32<T>
 /* end file src/simdutf/haswell/simd16-inl.h */
 
 } // namespace simd
@@ -2325,7 +2758,7 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
 #endif // SIMDUTF_HASWELL_SIMD_H
 /* end file src/simdutf/haswell/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/end.h
 /* begin file src/simdutf/haswell/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
 // nothing needed.
@@ -2333,7 +2766,6 @@ struct simd16<uint16_t>: base16_numeric<uint16_t>  {
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -2342,7 +2774,7 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #endif // SIMDUTF_IMPLEMENTATION_HASWELL
 #endif // SIMDUTF_HASWELL_COMMON_H
 /* end file src/simdutf/haswell.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere.h
 /* begin file src/simdutf/westmere.h */
 #ifndef SIMDUTF_WESTMERE_H
 #define SIMDUTF_WESTMERE_H
@@ -2351,7 +2783,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 #error "westmere.h must be included before fallback.h"
 #endif
 
-
 // Default Westmere to on if this is x86-64, unless we'll always select Haswell.
 #ifndef SIMDUTF_IMPLEMENTATION_WESTMERE
 //
@@ -2366,11 +2797,11 @@ SIMDUTF_POP_DISABLE_WARNINGS
 
 #endif
 
-#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__ && __PCLMUL__)
+#define SIMDUTF_CAN_ALWAYS_RUN_WESTMERE (SIMDUTF_IMPLEMENTATION_WESTMERE && SIMDUTF_IS_X86_64 && __SSE4_2__)
 
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
 
-#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,pclmul")
+#define SIMDUTF_TARGET_WESTMERE SIMDUTF_TARGET_REGION("sse4.2,popcnt")
 
 namespace simdutf {
 /**
@@ -2383,12 +2814,11 @@ namespace westmere {
 //
 // These two need to be included outside SIMDUTF_TARGET_REGION
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/implementation.h
 /* begin file src/simdutf/westmere/implementation.h */
 #ifndef SIMDUTF_WESTMERE_IMPLEMENTATION_H
 #define SIMDUTF_WESTMERE_IMPLEMENTATION_H
 
-
 // The constructor may be executed on any host, so we take care not to use SIMDUTF_TARGET_REGION
 namespace simdutf {
 namespace westmere {
@@ -2399,60 +2829,85 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42 | internal::instruction_set::PCLMULQDQ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation("westmere", "Intel/AMD SSE4.2", internal::instruction_set::SSE42)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace westmere
@@ -2460,7 +2915,7 @@ public:
 
 #endif // SIMDUTF_WESTMERE_IMPLEMENTATION_H
 /* end file src/simdutf/westmere/implementation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/intrinsics.h
 /* begin file src/simdutf/westmere/intrinsics.h */
 #ifndef SIMDUTF_WESTMERE_INTRINSICS_H
 #define SIMDUTF_WESTMERE_INTRINSICS_H
@@ -2480,7 +2935,6 @@ SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
 
 #include <x86intrin.h> // elsewhere
 
-
 #if SIMDUTF_GCC11ORMORE
 // cancels the suppression of the -Wuninitialized
 SIMDUTF_POP_DISABLE_WARNINGS
@@ -2488,7 +2942,6 @@ SIMDUTF_POP_DISABLE_WARNINGS
 
 #endif // SIMDUTF_VISUAL_STUDIO
 
-
 #ifdef SIMDUTF_CLANG_VISUAL_STUDIO
 /**
  * You are not supposed, normally, to include these
@@ -2498,19 +2951,16 @@ SIMDUTF_POP_DISABLE_WARNINGS
  * only get included *if* the corresponding features are detected
  * from macros:
  */
-#include <smmintrin.h>  // for _mm_alignr_epi8
-#include <wmmintrin.h>  // for  _mm_clmulepi64_si128
+#include <smmintrin.h> // for _mm_alignr_epi8
 #endif
 
-
-
 #endif // SIMDUTF_WESTMERE_INTRINSICS_H
 /* end file src/simdutf/westmere/intrinsics.h */
 
 //
 // The rest need to be inside the region
 //
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/begin.h
 /* begin file src/simdutf/westmere/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
 // #define SIMDUTF_IMPLEMENTATION westmere
@@ -2523,7 +2973,7 @@ SIMDUTF_TARGET_WESTMERE
 /* end file src/simdutf/westmere/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/bitmanipulation.h
 /* begin file src/simdutf/westmere/bitmanipulation.h */
 #ifndef SIMDUTF_WESTMERE_BITMANIPULATION_H
 #define SIMDUTF_WESTMERE_BITMANIPULATION_H
@@ -2533,13 +2983,15 @@ namespace westmere {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num);// Visual Studio wants two underscores
+simdutf_really_inline unsigned __int64 count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline long long int count_ones(uint64_t input_num) {
-  return _popcnt64(input_num);
+simdutf_really_inline long long int count_ones(uint64_t input_num)
+{
+    return _popcnt64(input_num);
 }
 #endif
 
@@ -2549,7 +3001,7 @@ simdutf_really_inline long long int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_WESTMERE_BITMANIPULATION_H
 /* end file src/simdutf/westmere/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/simd.h
 /* begin file src/simdutf/westmere/simd.h */
 #ifndef SIMDUTF_WESTMERE_SIMD_H
 #define SIMDUTF_WESTMERE_SIMD_H
@@ -2559,110 +3011,155 @@ namespace westmere {
 namespace {
 namespace simd {
 
-  template<typename Child>
-  struct base {
+template<typename Child>
+struct base {
     __m128i value;
 
     // Zero constructor
-    simdutf_really_inline base() : value{__m128i()} {}
+    simdutf_really_inline base()
+        : value { __m128i() }
+    {
+    }
 
     // Conversion from SIMD register
-    simdutf_really_inline base(const __m128i _value) : value(_value) {}
+    simdutf_really_inline base(const __m128i _value)
+        : value(_value)
+    {
+    }
     // Conversion to SIMD register
     simdutf_really_inline operator const __m128i&() const { return this->value; }
     simdutf_really_inline operator __m128i&() { return this->value; }
-    template <endianness big_endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * p) const {
-      __m128i first = _mm_cvtepu8_epi16(*this);
-      __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this,8));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        first = _mm_shuffle_epi8(first, swap);
-        second = _mm_shuffle_epi8(second, swap);
-      }
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), first);
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), second);
-    }
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * p) const {
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p), _mm_cvtepu8_epi32(*this));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+4), _mm_cvtepu8_epi32(_mm_srli_si128(*this,4)));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+8), _mm_cvtepu8_epi32(_mm_srli_si128(*this,8)));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(p+12), _mm_cvtepu8_epi32(_mm_srli_si128(*this,12)));
+    template<endianness big_endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* p) const
+    {
+        __m128i first = _mm_cvtepu8_epi16(*this);
+        __m128i second = _mm_cvtepu8_epi16(_mm_srli_si128(*this, 8));
+        if (big_endian) {
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            first = _mm_shuffle_epi8(first, swap);
+            second = _mm_shuffle_epi8(second, swap);
+        }
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), first);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 8), second);
+    }
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* p) const
+    {
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p), _mm_cvtepu8_epi32(*this));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 4), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 4)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 8), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 8)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(p + 12), _mm_cvtepu8_epi32(_mm_srli_si128(*this, 12)));
     }
     // Bit operations
     simdutf_really_inline Child operator|(const Child other) const { return _mm_or_si128(*this, other); }
     simdutf_really_inline Child operator&(const Child other) const { return _mm_and_si128(*this, other); }
     simdutf_really_inline Child operator^(const Child other) const { return _mm_xor_si128(*this, other); }
     simdutf_really_inline Child bit_andnot(const Child other) const { return _mm_andnot_si128(other, *this); }
-    simdutf_really_inline Child& operator|=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast | other; return *this_cast; }
-    simdutf_really_inline Child& operator&=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast & other; return *this_cast; }
-    simdutf_really_inline Child& operator^=(const Child other) { auto this_cast = static_cast<Child*>(this); *this_cast = *this_cast ^ other; return *this_cast; }
-  };
+    simdutf_really_inline Child& operator|=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator&=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator^=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
+};
 
-  // Forward-declared so they can be used by splat and friends.
-  template<typename T>
-  struct simd8;
+// Forward-declared so they can be used by splat and friends.
+template<typename T>
+struct simd8;
 
-  template<typename T, typename Mask=simd8<bool>>
-  struct base8: base<simd8<T>> {
+template<typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
     typedef uint16_t bitmask_t;
     typedef uint32_t bitmask2_t;
 
-    simdutf_really_inline T first() const { return _mm_extract_epi8(*this,0); }
-    simdutf_really_inline T last() const { return _mm_extract_epi8(*this,15); }
-    simdutf_really_inline base8() : base<simd8<T>>() {}
-    simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+    simdutf_really_inline T first() const { return _mm_extract_epi8(*this, 0); }
+    simdutf_really_inline T last() const { return _mm_extract_epi8(*this, 15); }
+    simdutf_really_inline base8()
+        : base<simd8<T>>()
+    {
+    }
+    simdutf_really_inline base8(const __m128i _value)
+        : base<simd8<T>>(_value)
+    {
+    }
 
-    simdutf_really_inline Mask operator==(const simd8<T> other) const { return _mm_cmpeq_epi8(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs) { return _mm_cmpeq_epi8(lhs, rhs); }
 
     static const int SIZE = sizeof(base<simd8<T>>::value);
 
-    template<int N=1>
-    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
-      return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const
+    {
+        return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
     }
-  };
+};
 
-  // SIMD byte mask type (returned by things like eq and gt)
-  template<>
-  struct simd8<bool>: base8<bool> {
+// SIMD byte mask type (returned by things like eq and gt)
+template<>
+struct simd8<bool> : base8<bool> {
     static simdutf_really_inline simd8<bool> splat(bool _value) { return _mm_set1_epi8(uint8_t(-(!!_value))); }
 
-    simdutf_really_inline simd8<bool>() : base8() {}
-    simdutf_really_inline simd8<bool>(const __m128i _value) : base8<bool>(_value) {}
+    simdutf_really_inline simd8<bool>()
+        : base8()
+    {
+    }
+    simdutf_really_inline simd8<bool>(const __m128i _value)
+        : base8<bool>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8<bool>(bool _value) : base8<bool>(splat(_value)) {}
+    simdutf_really_inline simd8<bool>(bool _value)
+        : base8<bool>(splat(_value))
+    {
+    }
 
     simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
     simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
     simdutf_really_inline bool none() const { return _mm_testz_si128(*this, *this); }
     simdutf_really_inline bool all() const { return _mm_movemask_epi8(*this) == 0xFFFF; }
     simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
-  };
+};
 
-  template<typename T>
-  struct base8_numeric: base8<T> {
+template<typename T>
+struct base8_numeric : base8<T> {
     static simdutf_really_inline simd8<T> splat(T _value) { return _mm_set1_epi8(_value); }
     static simdutf_really_inline simd8<T> zero() { return _mm_setzero_si128(); }
-    static simdutf_really_inline simd8<T> load(const T values[16]) {
-      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+    static simdutf_really_inline simd8<T> load(const T values[16])
+    {
+        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
     }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     static simdutf_really_inline simd8<T> repeat_16(
-      T v0,  T v1,  T v2,  T v3,  T v4,  T v5,  T v6,  T v7,
-      T v8,  T v9,  T v10, T v11, T v12, T v13, T v14, T v15
-    ) {
-      return simd8<T>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+        T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15)
+    {
+        return simd8<T>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
-    simdutf_really_inline base8_numeric() : base8<T>() {}
-    simdutf_really_inline base8_numeric(const __m128i _value) : base8<T>(_value) {}
+    simdutf_really_inline base8_numeric()
+        : base8<T>()
+    {
+    }
+    simdutf_really_inline base8_numeric(const __m128i _value)
+        : base8<T>(_value)
+    {
+    }
 
     // Store to array
-    simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
+    simdutf_really_inline void store(T dst[16]) const { return _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), *this); }
 
     // Override to distinguish from bool version
     simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
@@ -2670,56 +3167,77 @@ namespace simd {
     // Addition/subtraction are the same for signed and unsigned
     simdutf_really_inline simd8<T> operator+(const simd8<T> other) const { return _mm_add_epi8(*this, other); }
     simdutf_really_inline simd8<T> operator-(const simd8<T> other) const { return _mm_sub_epi8(*this, other); }
-    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other) { *this = *this + other; return *static_cast<simd8<T>*>(this); }
-    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other) { *this = *this - other; return *static_cast<simd8<T>*>(this); }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd8<T>*>(this);
+    }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd8<T>*>(this);
+    }
 
     // Perform a lookup assuming the value is between 0 and 16 (undefined behavior for out of range values)
     template<typename L>
-    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-      return _mm_shuffle_epi8(lookup_table, *this);
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return _mm_shuffle_epi8(lookup_table, *this);
     }
 
     template<typename L>
     simdutf_really_inline simd8<L> lookup_16(
-        L replace0,  L replace1,  L replace2,  L replace3,
-        L replace4,  L replace5,  L replace6,  L replace7,
-        L replace8,  L replace9,  L replace10, L replace11,
-        L replace12, L replace13, L replace14, L replace15) const {
-      return lookup_16(simd8<L>::repeat_16(
-        replace0,  replace1,  replace2,  replace3,
-        replace4,  replace5,  replace6,  replace7,
-        replace8,  replace9,  replace10, replace11,
-        replace12, replace13, replace14, replace15
-      ));
-    }
-  };
-
-  // Signed bytes
-  template<>
-  struct simd8<int8_t> : base8_numeric<int8_t> {
-    simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
-    simdutf_really_inline simd8(const __m128i _value) : base8_numeric<int8_t>(_value) {}
+        L replace0, L replace1, L replace2, L replace3,
+        L replace4, L replace5, L replace6, L replace7,
+        L replace8, L replace9, L replace10, L replace11,
+        L replace12, L replace13, L replace14, L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3,
+            replace4, replace5, replace6, replace7,
+            replace8, replace9, replace10, replace11,
+            replace12, replace13, replace14, replace15));
+    }
+};
+
+// Signed bytes
+template<>
+struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<int8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<int8_t>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const int8_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const int8_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) : simd8(_mm_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8(_mm_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<int8_t> repeat_16(
-      int8_t v0,  int8_t v1,  int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
-      int8_t v8,  int8_t v9,  int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
-    ) {
-      return simd8<int8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
     simdutf_really_inline operator simd8<uint8_t>() const;
     simdutf_really_inline bool is_ascii() const { return _mm_movemask_epi8(*this) == 0; }
@@ -2729,35 +3247,47 @@ namespace simd {
     simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const { return _mm_min_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(*this, other); }
     simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const { return _mm_cmpgt_epi8(other, *this); }
-  };
+};
 
-  // Unsigned bytes
-  template<>
-  struct simd8<uint8_t>: base8_numeric<uint8_t>  {
-    simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
-    simdutf_really_inline simd8(const __m128i _value) : base8_numeric<uint8_t>(_value) {}
+// Unsigned bytes
+template<>
+struct simd8<uint8_t> : base8_numeric<uint8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<uint8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<uint8_t>(_value)
+    {
+    }
 
     // Splat constructor
-    simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint8_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint8_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) : simd8(_mm_setr_epi8(
-      v0, v1, v2, v3, v4, v5, v6, v7,
-      v8, v9, v10,v11,v12,v13,v14,v15
-    )) {}
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+        : simd8(_mm_setr_epi8(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15))
+    {
+    }
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     simdutf_really_inline static simd8<uint8_t> repeat_16(
-      uint8_t v0,  uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
-      uint8_t v8,  uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
-    ) {
-      return simd8<uint8_t>(
-        v0, v1, v2, v3, v4, v5, v6, v7,
-        v8, v9, v10,v11,v12,v13,v14,v15
-      );
+        uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+        uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+    {
+        return simd8<uint8_t>(
+            v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14, v15);
     }
 
     // Saturated math
@@ -2794,30 +3324,44 @@ namespace simd {
     // Get one of the bits and make a bitmask out of it.
     // e.g. value.get_bit<7>() gets the high bit
     template<int N>
-    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
-  };
-  simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
+    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); }
+};
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const { return this->value; }
 
-  // Unsigned bytes
-  template<>
-  struct simd8<uint16_t>: base<uint16_t> {
+// Unsigned 16-bit words (eight lanes per 128-bit register)
+template<>
+struct simd8<uint16_t> : base<uint16_t> {
     static simdutf_really_inline simd8<uint16_t> splat(uint16_t _value) { return _mm_set1_epi16(_value); }
-    static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8]) {
-      return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
+    static simdutf_really_inline simd8<uint16_t> load(const uint16_t values[8])
+    {
+        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
     }
 
-    simdutf_really_inline simd8() : base<uint16_t>() {}
-    simdutf_really_inline simd8(const __m128i _value) : base<uint16_t>(_value) {}
+    simdutf_really_inline simd8()
+        : base<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base<uint16_t>(_value)
+    {
+    }
     // Splat constructor
-    simdutf_really_inline simd8(uint16_t _value) : simd8(splat(_value)) {}
+    simdutf_really_inline simd8(uint16_t _value)
+        : simd8(splat(_value))
+    {
+    }
     // Array constructor
-    simdutf_really_inline simd8(const uint16_t* values) : simd8(load(values)) {}
+    simdutf_really_inline simd8(const uint16_t* values)
+        : simd8(load(values))
+    {
+    }
     // Member-by-member initialization
     simdutf_really_inline simd8(
-      uint16_t v0,  uint16_t v1,  uint16_t v2,  uint16_t v3,  uint16_t v4,  uint16_t v5,  uint16_t v6,  uint16_t v7
-    ) : simd8(_mm_setr_epi16(
-      v0, v1, v2, v3, v4, v5, v6, v7
-    )) {}
+        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+        : simd8(_mm_setr_epi16(
+            v0, v1, v2, v3, v4, v5, v6, v7))
+    {
+    }
 
     // Saturated math
     simdutf_really_inline simd8<uint16_t> saturating_add(const simd8<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
@@ -2844,9 +3388,9 @@ namespace simd {
     simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
     simdutf_really_inline bool bits_not_set_anywhere(simd8<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
     simdutf_really_inline bool any_bits_set_anywhere(simd8<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-     };
-  template<typename T>
-  struct simd8x64 {
+};
+template<typename T>
+struct simd8x64 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
     simd8<T> chunks[NUM_CHUNKS];
@@ -2855,303 +3399,395 @@ namespace simd {
     simd8x64<T>& operator=(const simd8<T> other) = delete; // no assignment allowed
     simd8x64() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
-
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd8<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd8<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd8<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd8<T>)*3/sizeof(T));
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1, const simd8<T> chunk2, const simd8<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
+    {
+    }
+
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    }
+
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        this->chunks[2] |= other.chunks[2];
+        this->chunks[3] |= other.chunks[3];
+        return *this;
+    }
+
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    }
+
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii();
+    }
+
+    template<endianness endian>
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].template store_ascii_as_utf16<endian>(ptr + sizeof(simd8<T>) * 3);
+    }
+
+    simdutf_really_inline void store_ascii_as_utf32(char32_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+        this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
+        this->chunks[2].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 2);
+        this->chunks[3].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 3);
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r1 = this->chunks[1].to_bitmask();
+        uint64_t r2 = this->chunks[2].to_bitmask();
+        uint64_t r3 = this->chunks[3].to_bitmask();
+        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
+    {
+        return simd8x64<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1],
+            this->chunks[2] == other.chunks[2],
+            this->chunks[3] == other.chunks[3])
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low - 1);
+        const simd8<T> mask_high = simd8<T>::splat(high + 1);
+        return simd8x64<bool>(
+            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+            (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+            (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t gt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask,
+            this->chunks[2] > mask,
+            this->chunks[3] > mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask,
+            this->chunks[2] >= mask,
+            this->chunks[3] >= mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    {
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
+            simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
+            simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
+            simd8<uint8_t>(__m128i(this->chunks[3])) >= mask)
+            .to_bitmask();
     }
+}; // struct simd8x64<T>
 
-    simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      this->chunks[2] |= other.chunks[2];
-      this->chunks[3] |= other.chunks[3];
-      return *this;
-    }
-
-    simdutf_really_inline simd8<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
-    }
-
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
-    }
-
-    template <endianness endian>
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].template store_ascii_as_utf16<endian>(ptr+sizeof(simd8<T>)*3);
-    }
-
-    simdutf_really_inline void store_ascii_as_utf32(char32_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*0);
-      this->chunks[1].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*1);
-      this->chunks[2].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*2);
-      this->chunks[3].store_ascii_as_utf32(ptr+sizeof(simd8<T>)*3);
-    }
-
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
-      uint64_t r1 =          this->chunks[1].to_bitmask() ;
-      uint64_t r2 =          this->chunks[2].to_bitmask() ;
-      uint64_t r3 =          this->chunks[3].to_bitmask() ;
-      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask,
-        this->chunks[2] == mask,
-        this->chunks[3] == mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
-      return  simd8x64<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1],
-        this->chunks[2] == other.chunks[2],
-        this->chunks[3] == other.chunks[3]
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask,
-        this->chunks[2] <= mask,
-        this->chunks[3] <= mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low-1);
-      const simd8<T> mask_high = simd8<T>::splat(high+1);
-      return simd8x64<bool>(
-        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
-        (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
-        (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask,
-        this->chunks[2] > mask,
-        this->chunks[3] > mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask,
-        this->chunks[2] >= mask,
-        this->chunks[3] >= mask
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        simd8<uint8_t>(__m128i(this->chunks[0])) >= mask,
-        simd8<uint8_t>(__m128i(this->chunks[1])) >= mask,
-        simd8<uint8_t>(__m128i(this->chunks[2])) >= mask,
-        simd8<uint8_t>(__m128i(this->chunks[3])) >= mask
-      ).to_bitmask();
-    }
-  }; // struct simd8x64<T>
-
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/simd16-inl.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/simd16-inl.h
 /* begin file src/simdutf/westmere/simd16-inl.h */
 template<typename T>
 struct simd16;
 
-template<typename T, typename Mask=simd16<bool>>
-struct base16: base<simd16<T>> {
-  typedef uint16_t bitmask_t;
-  typedef uint32_t bitmask2_t;
+template<typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> {
+    typedef uint16_t bitmask_t;
+    typedef uint32_t bitmask2_t;
 
-  simdutf_really_inline base16() : base<simd16<T>>() {}
-  simdutf_really_inline base16(const __m128i _value) : base<simd16<T>>(_value) {}
-  template <typename Pointer>
-  simdutf_really_inline base16(const Pointer* ptr) : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr))) {}
+    simdutf_really_inline base16()
+        : base<simd16<T>>()
+    {
+    }
+    simdutf_really_inline base16(const __m128i _value)
+        : base<simd16<T>>(_value)
+    {
+    }
+    template<typename Pointer>
+    simdutf_really_inline base16(const Pointer* ptr)
+        : base16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)))
+    {
+    }
 
-  simdutf_really_inline Mask operator==(const simd16<T> other) const { return _mm_cmpeq_epi16(*this, other); }
+    friend simdutf_really_inline Mask operator==(const simd16<T> lhs, const simd16<T> rhs) { return _mm_cmpeq_epi16(lhs, rhs); }
 
-  static const int SIZE = sizeof(base<simd16<T>>::value);
+    static const int SIZE = sizeof(base<simd16<T>>::value);
 
-  template<int N=1>
-  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
-    return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
-  }
+    template<int N = 1>
+    simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const
+    {
+        return _mm_alignr_epi8(*this, prev_chunk, 16 - N);
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
 template<>
-struct simd16<bool>: base16<bool> {
-  static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
+struct simd16<bool> : base16<bool> {
+    static simdutf_really_inline simd16<bool> splat(bool _value) { return _mm_set1_epi16(uint16_t(-(!!_value))); }
 
-  simdutf_really_inline simd16<bool>() : base16() {}
-  simdutf_really_inline simd16<bool>(const __m128i _value) : base16<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16<bool>(bool _value) : base16<bool>(splat(_value)) {}
+    // Constructors use the injected-class-name `simd16`; C++20 no longer accepts a template-id (`simd16<bool>`) as a constructor name.
+    simdutf_really_inline simd16()
+        : base16()
+    {}
+    // Wraps a raw 128-bit register as a mask.
+    simdutf_really_inline simd16(const __m128i _value)
+        : base16<bool>(_value)
+    {}
+    // Splat constructor: broadcasts _value to all lanes
+    // (all-ones per 16-bit lane for true, all-zeros for false).
+    simdutf_really_inline simd16(bool _value)
+        : base16<bool>(splat(_value))
+    {}
 
-  simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
-  simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
-  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+    simdutf_really_inline int to_bitmask() const { return _mm_movemask_epi8(*this); }
+    simdutf_really_inline bool any() const { return !_mm_testz_si128(*this, *this); }
+    simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
 };
 
 template<typename T>
-struct base16_numeric: base16<T> {
-  static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
-  static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
-  static simdutf_really_inline simd16<T> load(const T values[8]) {
-    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(values));
-  }
-
-  simdutf_really_inline base16_numeric() : base16<T>() {}
-  simdutf_really_inline base16_numeric(const __m128i _value) : base16<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), *this); }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
-  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
-  simdutf_really_inline simd16<T>& operator+=(const simd16<T> other) { *this = *this + other; return *static_cast<simd16<T>*>(this); }
-  simdutf_really_inline simd16<T>& operator-=(const simd16<T> other) { *this = *this - other; return *static_cast<simd16<T>*>(this); }
+struct base16_numeric : base16<T> {
+    static simdutf_really_inline simd16<T> splat(T _value) { return _mm_set1_epi16(_value); }
+    static simdutf_really_inline simd16<T> zero() { return _mm_setzero_si128(); }
+    static simdutf_really_inline simd16<T> load(const T values[8])
+    {
+        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(values));
+    }
+
+    simdutf_really_inline base16_numeric()
+        : base16<T>()
+    {
+    }
+    simdutf_really_inline base16_numeric(const __m128i _value)
+        : base16<T>(_value)
+    {
+    }
+
+    // Store to array
+    simdutf_really_inline void store(T dst[8]) const { return _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), *this); }
+
+    // Override to distinguish from bool version: inverts all 16 bits of every lane (XOR with 0xFFFF per lane; the former 0xFFu only flipped the low byte).
+    simdutf_really_inline simd16<T> operator~() const { return *this ^ splat(static_cast<T>(-1)); }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd16<T> operator+(const simd16<T> other) const { return _mm_add_epi16(*this, other); }
+    simdutf_really_inline simd16<T> operator-(const simd16<T> other) const { return _mm_sub_epi16(*this, other); }
+    simdutf_really_inline simd16<T>& operator+=(const simd16<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd16<T>*>(this);
+    }
+    simdutf_really_inline simd16<T>& operator-=(const simd16<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd16<T>*>(this);
+    }
 };
 
 // Signed words
 template<>
 struct simd16<int16_t> : base16_numeric<int16_t> {
-  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
-  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<int16_t>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const int16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const int16_t*>(values))) {}
-  // Member-by-member initialization
-  simdutf_really_inline simd16(
-    int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
-    : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
-  simdutf_really_inline operator simd16<uint16_t>() const;
-
-  // Order-sensitive comparisons
-  simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
-  simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
+    simdutf_really_inline simd16()
+        : base16_numeric<int16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m128i _value)
+        : base16_numeric<int16_t>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd16(int16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const int16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const int16_t*>(values)))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline simd16(
+        int16_t v0, int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7)
+        : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7))
+    {
+    }
+    simdutf_really_inline operator simd16<uint16_t>() const;
+
+    // Order-sensitive comparisons
+    simdutf_really_inline simd16<int16_t> max_val(const simd16<int16_t> other) const { return _mm_max_epi16(*this, other); }
+    simdutf_really_inline simd16<int16_t> min_val(const simd16<int16_t> other) const { return _mm_min_epi16(*this, other); }
+    simdutf_really_inline simd16<bool> operator>(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(*this, other); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<int16_t> other) const { return _mm_cmpgt_epi16(other, *this); }
 };
 
 // Unsigned words
 template<>
-struct simd16<uint16_t>: base16_numeric<uint16_t>  {
-  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
-  simdutf_really_inline simd16(const __m128i _value) : base16_numeric<uint16_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd16(const uint16_t* values) : simd16(load(values)) {}
-  simdutf_really_inline simd16(const char16_t* values) : simd16(load(reinterpret_cast<const uint16_t*>(values))) {}
-  // Member-by-member initialization
-  simdutf_really_inline simd16(
-    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
-  : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7)) {}
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  simdutf_really_inline static simd16<uint16_t> repeat_16(
-    uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7
-  ) {
-    return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
-  }
-
-  // Saturated math
-  simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
-
-  // Order-specific operations
-  simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
-  simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
-  simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
-  simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-  simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
-
-  // Bit-specific operations
-  simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
-  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
-  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
-
-  simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
-  simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
-  simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
-  simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
-  template<int N>
-  simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
-  // Get one of the bits and make a bitmask out of it.
-  // e.g. value.get_bit<7>() gets the high bit
-  template<int N>
-  simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7-N)); }
-
-  // Change the endianness
-  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
-    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    return _mm_shuffle_epi8(*this, swap);
-  }
+struct simd16<uint16_t> : base16_numeric<uint16_t> {
+    simdutf_really_inline simd16()
+        : base16_numeric<uint16_t>()
+    {
+    }
+    simdutf_really_inline simd16(const __m128i _value)
+        : base16_numeric<uint16_t>(_value)
+    {
+    }
 
-  // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
-  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1) {
-    return _mm_packus_epi16(v0, v1);
-  }
+    // Splat constructor
+    simdutf_really_inline simd16(uint16_t _value)
+        : simd16(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd16(const uint16_t* values)
+        : simd16(load(values))
+    {
+    }
+    simdutf_really_inline simd16(const char16_t* values)
+        : simd16(load(reinterpret_cast<const uint16_t*>(values)))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline simd16(
+        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+        : simd16(_mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7))
+    {
+    }
+    // Builds a vector from eight 16-bit values (16 bytes, i.e. one full 128-bit register), v0 first; usually for lookup tables.
+    simdutf_really_inline static simd16<uint16_t> repeat_16(
+        uint16_t v0, uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, uint16_t v5, uint16_t v6, uint16_t v7)
+    {
+        return simd16<uint16_t>(v0, v1, v2, v3, v4, v5, v6, v7);
+    }
+
+    // Saturated math
+    simdutf_really_inline simd16<uint16_t> saturating_add(const simd16<uint16_t> other) const { return _mm_adds_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> saturating_sub(const simd16<uint16_t> other) const { return _mm_subs_epu16(*this, other); }
+
+    // Order-specific operations
+    simdutf_really_inline simd16<uint16_t> max_val(const simd16<uint16_t> other) const { return _mm_max_epu16(*this, other); }
+    simdutf_really_inline simd16<uint16_t> min_val(const simd16<uint16_t> other) const { return _mm_min_epu16(*this, other); }
+    // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> gt_bits(const simd16<uint16_t> other) const { return this->saturating_sub(other); }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd16<uint16_t> lt_bits(const simd16<uint16_t> other) const { return other.saturating_sub(*this); }
+    simdutf_really_inline simd16<bool> operator<=(const simd16<uint16_t> other) const { return other.max_val(*this) == other; }
+    simdutf_really_inline simd16<bool> operator>=(const simd16<uint16_t> other) const { return other.min_val(*this) == other; }
+    simdutf_really_inline simd16<bool> operator>(const simd16<uint16_t> other) const { return this->gt_bits(other).any_bits_set(); }
+    simdutf_really_inline simd16<bool> operator<(const simd16<uint16_t> other) const { return this->lt_bits(other).any_bits_set(); } // unsigned per-lane this < other; previously called gt_bits (a copy of operator>)
+
+    // Bit-specific operations
+    simdutf_really_inline simd16<bool> bits_not_set() const { return *this == uint16_t(0); }
+    simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const { return (*this & bits).bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set() const { return ~this->bits_not_set(); }
+    simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const { return ~this->bits_not_set(bits); }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const { return _mm_testz_si128(*this, *this); }
+    simdutf_really_inline bool any_bits_set_anywhere() const { return !bits_not_set_anywhere(); }
+    simdutf_really_inline bool bits_not_set_anywhere(simd16<uint16_t> bits) const { return _mm_testz_si128(*this, bits); }
+    simdutf_really_inline bool any_bits_set_anywhere(simd16<uint16_t> bits) const { return !bits_not_set_anywhere(bits); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shr() const { return simd16<uint16_t>(_mm_srli_epi16(*this, N)); }
+    template<int N>
+    simdutf_really_inline simd16<uint16_t> shl() const { return simd16<uint16_t>(_mm_slli_epi16(*this, N)); }
+    // Shifts each 16-bit lane left by 7 - N, then gathers the top bit of every byte into a 16-bit mask:
+    // per lane that is bit N (low byte) and bit N + 8 (high byte). NOTE(review): presumably expects 0 <= N <= 7 — confirm.
+    template<int N>
+    simdutf_really_inline int get_bit() const { return _mm_movemask_epi8(_mm_slli_epi16(*this, 7 - N)); }
+
+    // Change the endianness
+    simdutf_really_inline simd16<uint16_t> swap_bytes() const
+    {
+        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        return _mm_shuffle_epi8(*this, swap);
+    }
+
+    // Pack with the unsigned saturation  two uint16_t words into single uint8_t vector
+    static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t>& v0, const simd16<uint16_t>& v1)
+    {
+        return _mm_packus_epi16(v0, v1);
+    }
 };
 simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const { return this->value; }
 
 template<typename T>
-  struct simd16x32 {
+struct simd16x32 {
     static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
     static_assert(NUM_CHUNKS == 4, "Westmere kernel should use four registers per 64-byte block.");
     simd16<T> chunks[NUM_CHUNKS];
@@ -3160,106 +3796,124 @@ template<typename T>
     simd16x32<T>& operator=(const simd16<T> other) = delete; // no assignment allowed
     simd16x32() = delete; // no default constructor allowed
 
-    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3) : chunks{chunk0, chunk1, chunk2, chunk3} {}
-    simdutf_really_inline simd16x32(const T* ptr) : chunks{simd16<T>::load(ptr), simd16<T>::load(ptr+sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+2*sizeof(simd16<T>)/sizeof(T)), simd16<T>::load(ptr+3*sizeof(simd16<T>)/sizeof(T))} {}
-
-    simdutf_really_inline void store(T* ptr) const {
-      this->chunks[0].store(ptr+sizeof(simd16<T>)*0/sizeof(T));
-      this->chunks[1].store(ptr+sizeof(simd16<T>)*1/sizeof(T));
-      this->chunks[2].store(ptr+sizeof(simd16<T>)*2/sizeof(T));
-      this->chunks[3].store(ptr+sizeof(simd16<T>)*3/sizeof(T));
-    }
-
-    simdutf_really_inline simd16<T> reduce_or() const {
-      return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
-    }
-
-    simdutf_really_inline bool is_ascii() const {
-      return this->reduce_or().is_ascii();
-    }
-
-    simdutf_really_inline void store_ascii_as_utf16(char16_t * ptr) const {
-      this->chunks[0].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*0);
-      this->chunks[1].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*1);
-      this->chunks[2].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*2);
-      this->chunks[3].store_ascii_as_utf16(ptr+sizeof(simd16<T>)*3);
-    }
-
-    simdutf_really_inline uint64_t to_bitmask() const {
-      uint64_t r0 = uint32_t(this->chunks[0].to_bitmask() );
-      uint64_t r1 =          this->chunks[1].to_bitmask() ;
-      uint64_t r2 =          this->chunks[2].to_bitmask() ;
-      uint64_t r3 =          this->chunks[3].to_bitmask() ;
-      return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-    }
-
-    simdutf_really_inline void swap_bytes() {
-      this->chunks[0] = this->chunks[0].swap_bytes();
-      this->chunks[1] = this->chunks[1].swap_bytes();
-      this->chunks[2] = this->chunks[2].swap_bytes();
-      this->chunks[3] = this->chunks[3].swap_bytes();
-    }
-
-    simdutf_really_inline uint64_t eq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] == mask,
-        this->chunks[1] == mask,
-        this->chunks[2] == mask,
-        this->chunks[3] == mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
-      return  simd16x32<bool>(
-        this->chunks[0] == other.chunks[0],
-        this->chunks[1] == other.chunks[1],
-        this->chunks[2] == other.chunks[2],
-        this->chunks[3] == other.chunks[3]
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t lteq(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] <= mask,
-        this->chunks[1] <= mask,
-        this->chunks[2] <= mask,
-        this->chunks[3] <= mask
-      ).to_bitmask();
-    }
-
-    simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(low);
-      const simd16<T> mask_high = simd16<T>::splat(high);
-
-      return  simd16x32<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low-1));
-      const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high+1));
-      return simd16x32<bool>(
-        (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
-        (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
-        (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
-        (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low)
-      ).to_bitmask();
-    }
-    simdutf_really_inline uint64_t lt(const T m) const {
-      const simd16<T> mask = simd16<T>::splat(m);
-      return  simd16x32<bool>(
-        this->chunks[0] < mask,
-        this->chunks[1] < mask,
-        this->chunks[2] < mask,
-        this->chunks[3] < mask
-      ).to_bitmask();
-    }
-  }; // struct simd16x32<T>
+    simdutf_really_inline simd16x32(const simd16<T> chunk0, const simd16<T> chunk1, const simd16<T> chunk2, const simd16<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
+    simdutf_really_inline simd16x32(const T* ptr)
+        : chunks { simd16<T>::load(ptr), simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)), simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T)) }
+    {
+    }
+
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
+    }
+
+    simdutf_really_inline simd16<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
+    }
+
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii();
+    }
+
+    simdutf_really_inline void store_ascii_as_utf16(char16_t* ptr) const
+    {
+        this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+        this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+        this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+        this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
+    }
+
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r1 = this->chunks[1].to_bitmask();
+        uint64_t r2 = this->chunks[2].to_bitmask();
+        uint64_t r3 = this->chunks[3].to_bitmask();
+        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
+
+    simdutf_really_inline void swap_bytes()
+    {
+        this->chunks[0] = this->chunks[0].swap_bytes();
+        this->chunks[1] = this->chunks[1].swap_bytes();
+        this->chunks[2] = this->chunks[2].swap_bytes();
+        this->chunks[3] = this->chunks[3].swap_bytes();
+    }
+
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] == mask,
+            this->chunks[1] == mask,
+            this->chunks[2] == mask,
+            this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd16x32<uint16_t>& other) const
+    {
+        return simd16x32<bool>(
+            this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1],
+            this->chunks[2] == other.chunks[2],
+            this->chunks[3] == other.chunks[3])
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] <= mask,
+            this->chunks[1] <= mask,
+            this->chunks[2] <= mask,
+            this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(low);
+        const simd16<T> mask_high = simd16<T>::splat(high);
+
+        return simd16x32<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
+        const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
+        return simd16x32<bool>(
+            (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+            (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low),
+            (this->chunks[2] >= mask_high) | (this->chunks[2] <= mask_low),
+            (this->chunks[3] >= mask_high) | (this->chunks[3] <= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd16<T> mask = simd16<T>::splat(m);
+        return simd16x32<bool>(
+            this->chunks[0] < mask,
+            this->chunks[1] < mask,
+            this->chunks[2] < mask,
+            this->chunks[3] < mask)
+            .to_bitmask();
+    }
+}; // struct simd16x32<T>
 /* end file src/simdutf/westmere/simd16-inl.h */
 
 } // namespace simd
@@ -3270,7 +3924,7 @@ template<typename T>
 #endif // SIMDUTF_WESTMERE_SIMD_INPUT_H
 /* end file src/simdutf/westmere/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/end.h
 /* begin file src/simdutf/westmere/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
 // nothing needed.
@@ -3283,7 +3937,7 @@ SIMDUTF_UNTARGET_REGION
 #endif // SIMDUTF_IMPLEMENTATION_WESTMERE
 #endif // SIMDUTF_WESTMERE_COMMON_H
 /* end file src/simdutf/westmere.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64.h
 /* begin file src/simdutf/ppc64.h */
 #ifndef SIMDUTF_PPC64_H
 #define SIMDUTF_PPC64_H
@@ -3292,13 +3946,10 @@ SIMDUTF_UNTARGET_REGION
 #error "ppc64.h must be included before fallback.h"
 #endif
 
-
 #ifndef SIMDUTF_IMPLEMENTATION_PPC64
 #define SIMDUTF_IMPLEMENTATION_PPC64 (SIMDUTF_IS_PPC64)
 #endif
-#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64
-
-
+#define SIMDUTF_CAN_ALWAYS_RUN_PPC64 (SIMDUTF_IMPLEMENTATION_PPC64 && SIMDUTF_IS_PPC64) // parenthesized so the expression stays intact when negated or combined inside a larger #if
 
 #if SIMDUTF_IMPLEMENTATION_PPC64
 
@@ -3310,12 +3961,11 @@ namespace ppc64 {
 } // namespace ppc64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/implementation.h
 /* begin file src/simdutf/ppc64/implementation.h */
 #ifndef SIMDUTF_PPC64_IMPLEMENTATION_H
 #define SIMDUTF_PPC64_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace ppc64 {
 
@@ -3325,62 +3975,64 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation()
-      : simdutf::implementation("ppc64", "PPC64 ALTIVEC",
-                                 internal::instruction_set::ALTIVEC) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation() // default constructor: forwards fixed identification values to the simdutf::implementation base
+        : simdutf::implementation("ppc64", "PPC64 ALTIVEC", // presumably (name, description, ...) — confirm against the base-class constructor
+            internal::instruction_set::ALTIVEC) // presumably the instruction set this backend requires — confirm
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept; // NOTE(review): this declaration and the ones below omit the `final` used on the declarations above — confirm whether they are meant to override base-class virtuals
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
 };
 
 } // namespace ppc64
@@ -3389,19 +4041,18 @@ public:
 #endif // SIMDUTF_PPC64_IMPLEMENTATION_H
 /* end file src/simdutf/ppc64/implementation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/begin.h
 /* begin file src/simdutf/ppc64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
 // #define SIMDUTF_IMPLEMENTATION ppc64
 /* end file src/simdutf/ppc64/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/intrinsics.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/intrinsics.h
 /* begin file src/simdutf/ppc64/intrinsics.h */
 #ifndef SIMDUTF_PPC64_INTRINSICS_H
 #define SIMDUTF_PPC64_INTRINSICS_H
 
-
 // This should be the correct header whether
 // you use visual studio or other compilers.
 #include <altivec.h>
@@ -3417,7 +4068,7 @@ public:
 
 #endif //  SIMDUTF_PPC64_INTRINSICS_H
 /* end file src/simdutf/ppc64/intrinsics.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/bitmanipulation.h
 /* begin file src/simdutf/ppc64/bitmanipulation.h */
 #ifndef SIMDUTF_PPC64_BITMANIPULATION_H
 #define SIMDUTF_PPC64_BITMANIPULATION_H
@@ -3427,13 +4078,15 @@ namespace ppc64 {
 namespace {
 
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-simdutf_really_inline int count_ones(uint64_t input_num) {
-  // note: we do not support legacy 32-bit Windows
-  return __popcnt64(input_num); // Visual Studio wants two underscores
+simdutf_really_inline int count_ones(uint64_t input_num)
+{
+    // note: we do not support legacy 32-bit Windows
+    return __popcnt64(input_num); // Visual Studio wants two underscores
 }
 #else
-simdutf_really_inline int count_ones(uint64_t input_num) {
-  return __builtin_popcountll(input_num);
+simdutf_really_inline int count_ones(uint64_t input_num)
+{
+    return __builtin_popcountll(input_num);
 }
 #endif
 
@@ -3443,7 +4096,7 @@ simdutf_really_inline int count_ones(uint64_t input_num) {
 
 #endif // SIMDUTF_PPC64_BITMANIPULATION_H
 /* end file src/simdutf/ppc64/bitmanipulation.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/simd.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/simd.h
 /* begin file src/simdutf/ppc64/simd.h */
 #ifndef SIMDUTF_PPC64_SIMD_H
 #define SIMDUTF_PPC64_SIMD_H
@@ -3457,474 +4110,592 @@ namespace simd {
 
 using __m128i = __vector unsigned char;
 
-template <typename Child> struct base {
-  __m128i value;
-
-  // Zero constructor
-  simdutf_really_inline base() : value{__m128i()} {}
-
-  // Conversion from SIMD register
-  simdutf_really_inline base(const __m128i _value) : value(_value) {}
-
-  // Conversion to SIMD register
-  simdutf_really_inline operator const __m128i &() const {
-    return this->value;
-  }
-  simdutf_really_inline operator __m128i &() { return this->value; }
-
-  // Bit operations
-  simdutf_really_inline Child operator|(const Child other) const {
-    return vec_or(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child operator&(const Child other) const {
-    return vec_and(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child operator^(const Child other) const {
-    return vec_xor(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child bit_andnot(const Child other) const {
-    return vec_andc(this->value, (__m128i)other);
-  }
-  simdutf_really_inline Child &operator|=(const Child other) {
-    auto this_cast = static_cast<Child*>(this);
-    *this_cast = *this_cast | other;
-    return *this_cast;
-  }
-  simdutf_really_inline Child &operator&=(const Child other) {
-    auto this_cast = static_cast<Child*>(this);
-    *this_cast = *this_cast & other;
-    return *this_cast;
-  }
-  simdutf_really_inline Child &operator^=(const Child other) {
-    auto this_cast = static_cast<Child*>(this);
-    *this_cast = *this_cast ^ other;
-    return *this_cast;
-  }
+template<typename Child> struct base {
+    __m128i value;
+
+    // Zero constructor
+    simdutf_really_inline base()
+        : value { __m128i() }
+    {
+    }
+
+    // Conversion from SIMD register
+    simdutf_really_inline base(const __m128i _value)
+        : value(_value)
+    {
+    }
+
+    // Conversion to SIMD register
+    simdutf_really_inline operator const __m128i&() const
+    {
+        return this->value;
+    }
+    simdutf_really_inline operator __m128i&() { return this->value; }
+
+    // Bit operations
+    simdutf_really_inline Child operator|(const Child other) const
+    {
+        return vec_or(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child operator&(const Child other) const
+    {
+        return vec_and(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child operator^(const Child other) const
+    {
+        return vec_xor(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child bit_andnot(const Child other) const
+    {
+        return vec_andc(this->value, (__m128i)other);
+    }
+    simdutf_really_inline Child& operator|=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast | other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator&=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast & other;
+        return *this_cast;
+    }
+    simdutf_really_inline Child& operator^=(const Child other)
+    {
+        auto this_cast = static_cast<Child*>(this);
+        *this_cast = *this_cast ^ other;
+        return *this_cast;
+    }
 };
 
 // Forward-declared so they can be used by splat and friends.
-template <typename T> struct simd8;
+template<typename T> struct simd8;
 
-template <typename T, typename Mask = simd8<bool>>
+template<typename T, typename Mask = simd8<bool>>
 struct base8 : base<simd8<T>> {
-  typedef uint16_t bitmask_t;
-  typedef uint32_t bitmask2_t;
+    typedef uint16_t bitmask_t;
+    typedef uint32_t bitmask2_t;
 
-  simdutf_really_inline base8() : base<simd8<T>>() {}
-  simdutf_really_inline base8(const __m128i _value) : base<simd8<T>>(_value) {}
+    simdutf_really_inline base8()
+        : base<simd8<T>>()
+    {
+    }
+    simdutf_really_inline base8(const __m128i _value)
+        : base<simd8<T>>(_value)
+    {
+    }
 
-  simdutf_really_inline Mask operator==(const simd8<T> other) const {
-    return (__m128i)vec_cmpeq(this->value, (__m128i)other);
-  }
+    friend simdutf_really_inline Mask operator==(const simd8<T> lhs, const simd8<T> rhs)
+    {
+        return (__m128i)vec_cmpeq(lhs.value, (__m128i)rhs);
+    }
 
-  static const int SIZE = sizeof(base<simd8<T>>::value);
+    static const int SIZE = sizeof(base<simd8<T>>::value);
 
-  template <int N = 1>
-  simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const {
-    __m128i chunk = this->value;
+    template<int N = 1>
+    simdutf_really_inline simd8<T> prev(simd8<T> prev_chunk) const
+    {
+        __m128i chunk = this->value;
 #ifdef __LITTLE_ENDIAN__
-    chunk = (__m128i)vec_reve(this->value);
-    prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
+        chunk = (__m128i)vec_reve(this->value);
+        prev_chunk = (__m128i)vec_reve((__m128i)prev_chunk);
 #endif
-    chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
+        chunk = (__m128i)vec_sld((__m128i)prev_chunk, (__m128i)chunk, 16 - N);
 #ifdef __LITTLE_ENDIAN__
-    chunk = (__m128i)vec_reve((__m128i)chunk);
+        chunk = (__m128i)vec_reve((__m128i)chunk);
 #endif
-    return chunk;
-  }
+        return chunk;
+    }
 };
 
 // SIMD byte mask type (returned by things like eq and gt)
-template <> struct simd8<bool> : base8<bool> {
-  static simdutf_really_inline simd8<bool> splat(bool _value) {
-    return (__m128i)vec_splats((unsigned char)(-(!!_value)));
-  }
-
-  simdutf_really_inline simd8<bool>() : base8() {}
-  simdutf_really_inline simd8<bool>(const __m128i _value)
-      : base8<bool>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd8<bool>(bool _value)
-      : base8<bool>(splat(_value)) {}
-
-  simdutf_really_inline int to_bitmask() const {
-    __vector unsigned long long result;
-    const __m128i perm_mask = {0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
-                               0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
-
-    result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
-                                                       (__m128i)perm_mask));
+template<> struct simd8<bool> : base8<bool> {
+    static simdutf_really_inline simd8<bool> splat(bool _value)
+    {
+        return (__m128i)vec_splats((unsigned char)(-(!!_value)));
+    }
+
+    simdutf_really_inline simd8() // declared via the injected class name: C++20 rejects a template-id (simd8<bool>) as a constructor declarator
+        : base8<bool>() // zero-initialized mask, as produced by the base default constructor
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value) // wrap an existing vector register value
+        : base8<bool>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd8(bool _value) // every byte lane receives the mask produced by splat(_value)
+        : base8<bool>(splat(_value))
+    {
+    }
+
+    simdutf_really_inline int to_bitmask() const
+    {
+        __vector unsigned long long result;
+        const __m128i perm_mask = { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+            0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 };
+
+        result = ((__vector unsigned long long)vec_vbpermq((__m128i)this->value,
+            (__m128i)perm_mask));
 #ifdef __LITTLE_ENDIAN__
-    return static_cast<int>(result[1]);
+        return static_cast<int>(result[1]);
 #else
-    return static_cast<int>(result[0]);
+        return static_cast<int>(result[0]);
 #endif
-  }
-  simdutf_really_inline bool any() const {
-    return !vec_all_eq(this->value, (__m128i)vec_splats(0));
-  }
-  simdutf_really_inline simd8<bool> operator~() const {
-    return this->value ^ (__m128i)splat(true);
-  }
+    }
+    simdutf_really_inline bool any() const
+    {
+        return !vec_all_eq(this->value, (__m128i)vec_splats(0));
+    }
+    simdutf_really_inline simd8<bool> operator~() const
+    {
+        return this->value ^ (__m128i)splat(true);
+    }
 };
 
-template <typename T> struct base8_numeric : base8<T> {
-  static simdutf_really_inline simd8<T> splat(T value) {
-    (void)value;
-    return (__m128i)vec_splats(value);
-  }
-  static simdutf_really_inline simd8<T> zero() { return splat(0); }
-  static simdutf_really_inline simd8<T> load(const T values[16]) {
-    return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t *>(values)));
-  }
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
-                                                   T v5, T v6, T v7, T v8, T v9,
-                                                   T v10, T v11, T v12, T v13,
-                                                   T v14, T v15) {
-    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
-                    v14, v15);
-  }
-
-  simdutf_really_inline base8_numeric() : base8<T>() {}
-  simdutf_really_inline base8_numeric(const __m128i _value)
-      : base8<T>(_value) {}
-
-  // Store to array
-  simdutf_really_inline void store(T dst[16]) const {
-    vec_vsx_st(this->value, 0, reinterpret_cast<__m128i *>(dst));
-  }
-
-  // Override to distinguish from bool version
-  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
-
-  // Addition/subtraction are the same for signed and unsigned
-  simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
-    return (__m128i)((__m128i)this->value + (__m128i)other);
-  }
-  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
-    return (__m128i)((__m128i)this->value - (__m128i)other);
-  }
-  simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
-    *this = *this + other;
-    return *static_cast<simd8<T> *>(this);
-  }
-  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
-    *this = *this - other;
-    return *static_cast<simd8<T> *>(this);
-  }
-
-  // Perform a lookup assuming the value is between 0 and 16 (undefined behavior
-  // for out of range values)
-  template <typename L>
-  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
-    return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
-  }
-
-  template <typename L>
-  simdutf_really_inline simd8<L>
-  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
-            L replace5, L replace6, L replace7, L replace8, L replace9,
-            L replace10, L replace11, L replace12, L replace13, L replace14,
-            L replace15) const {
-    return lookup_16(simd8<L>::repeat_16(
-        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
-        replace7, replace8, replace9, replace10, replace11, replace12,
-        replace13, replace14, replace15));
-  }
+template<typename T> struct base8_numeric : base8<T> {
+    static simdutf_really_inline simd8<T> splat(T value)
+    {
+        (void)value;
+        return (__m128i)vec_splats(value);
+    }
+    static simdutf_really_inline simd8<T> zero() { return splat(0); }
+    static simdutf_really_inline simd8<T> load(const T values[16])
+    {
+        return (__m128i)(vec_vsx_ld(0, reinterpret_cast<const uint8_t*>(values)));
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+        T v5, T v6, T v7, T v8, T v9,
+        T v10, T v11, T v12, T v13,
+        T v14, T v15)
+    {
+        return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+            v14, v15);
+    }
+
+    simdutf_really_inline base8_numeric()
+        : base8<T>()
+    {
+    }
+    simdutf_really_inline base8_numeric(const __m128i _value)
+        : base8<T>(_value)
+    {
+    }
+
+    // Store to array
+    simdutf_really_inline void store(T dst[16]) const
+    {
+        vec_vsx_st(this->value, 0, reinterpret_cast<__m128i*>(dst));
+    }
+
+    // Override to distinguish from bool version
+    simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+
+    // Addition/subtraction are the same for signed and unsigned
+    simdutf_really_inline simd8<T> operator+(const simd8<T> other) const
+    {
+        return (__m128i)((__m128i)this->value + (__m128i)other);
+    }
+    simdutf_really_inline simd8<T> operator-(const simd8<T> other) const
+    {
+        return (__m128i)((__m128i)this->value - (__m128i)other);
+    }
+    simdutf_really_inline simd8<T>& operator+=(const simd8<T> other)
+    {
+        *this = *this + other;
+        return *static_cast<simd8<T>*>(this);
+    }
+    simdutf_really_inline simd8<T>& operator-=(const simd8<T> other)
+    {
+        *this = *this - other;
+        return *static_cast<simd8<T>*>(this);
+    }
+
+    // Perform a lookup assuming the value is between 0 and 15 (undefined behavior
+    // for out of range values)
+    template<typename L>
+    simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const
+    {
+        return (__m128i)vec_perm((__m128i)lookup_table, (__m128i)lookup_table, this->value);
+    }
+
+    template<typename L>
+    simdutf_really_inline simd8<L>
+    lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+        L replace5, L replace6, L replace7, L replace8, L replace9,
+        L replace10, L replace11, L replace12, L replace13, L replace14,
+        L replace15) const
+    {
+        return lookup_16(simd8<L>::repeat_16(
+            replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+            replace7, replace8, replace9, replace10, replace11, replace12,
+            replace13, replace14, replace15));
+    }
 };
 
 // Signed bytes
-template <> struct simd8<int8_t> : base8_numeric<int8_t> {
-  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
-  simdutf_really_inline simd8(const __m128i _value)
-      : base8_numeric<int8_t>(_value) {}
-
-  // Splat constructor
-  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
-  // Member-by-member initialization
-  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
-                               int8_t v4, int8_t v5, int8_t v6, int8_t v7,
-                               int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-                               int8_t v12, int8_t v13, int8_t v14, int8_t v15)
-      : simd8((__m128i)(__vector signed char){v0, v1, v2, v3, v4, v5, v6, v7,
-                                              v8, v9, v10, v11, v12, v13, v14,
-                                              v15}) {}
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  simdutf_really_inline static simd8<int8_t>
-  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
-            int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
-            int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
-    return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-                         v13, v14, v15);
-  }
-
-  // Order-sensitive comparisons
-  simdutf_really_inline simd8<int8_t>
-  max_val(const simd8<int8_t> other) const {
-    return (__m128i)vec_max((__vector signed char)this->value,
-                            (__vector signed char)(__m128i)other);
-  }
-  simdutf_really_inline simd8<int8_t>
-  min_val(const simd8<int8_t> other) const {
-    return (__m128i)vec_min((__vector signed char)this->value,
-                            (__vector signed char)(__m128i)other);
-  }
-  simdutf_really_inline simd8<bool>
-  operator>(const simd8<int8_t> other) const {
-    return (__m128i)vec_cmpgt((__vector signed char)this->value,
-                              (__vector signed char)(__m128i)other);
-  }
-  simdutf_really_inline simd8<bool>
-  operator<(const simd8<int8_t> other) const {
-    return (__m128i)vec_cmplt((__vector signed char)this->value,
-                              (__vector signed char)(__m128i)other);
-  }
+template<> struct simd8<int8_t> : base8_numeric<int8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<int8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<int8_t>(_value)
+    {
+    }
+
+    // Splat constructor
+    simdutf_really_inline simd8(int8_t _value)
+        : simd8(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd8(const int8_t* values)
+        : simd8(load(values))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+        int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+        int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+        int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+        : simd8((__m128i)(__vector signed char) { v0, v1, v2, v3, v4, v5, v6, v7,
+            v8, v9, v10, v11, v12, v13, v14,
+            v15 })
+    {
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    simdutf_really_inline static simd8<int8_t>
+    repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+        int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+        int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+    {
+        return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+            v13, v14, v15);
+    }
+
+    // Order-sensitive comparisons
+    simdutf_really_inline simd8<int8_t>
+    max_val(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_max((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
+    simdutf_really_inline simd8<int8_t>
+    min_val(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_min((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
+    simdutf_really_inline simd8<bool>
+    operator>(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_cmpgt((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
+    simdutf_really_inline simd8<bool>
+    operator<(const simd8<int8_t> other) const
+    {
+        return (__m128i)vec_cmplt((__vector signed char)this->value,
+            (__vector signed char)(__m128i)other);
+    }
 };
 
 // Unsigned bytes
-template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
-  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
-  simdutf_really_inline simd8(const __m128i _value)
-      : base8_numeric<uint8_t>(_value) {}
-  // Splat constructor
-  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
-  // Array constructor
-  simdutf_really_inline simd8(const uint8_t *values) : simd8(load(values)) {}
-  // Member-by-member initialization
-  simdutf_really_inline
-  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+template<> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+    simdutf_really_inline simd8()
+        : base8_numeric<uint8_t>()
+    {
+    }
+    simdutf_really_inline simd8(const __m128i _value)
+        : base8_numeric<uint8_t>(_value)
+    {
+    }
+    // Splat constructor
+    simdutf_really_inline simd8(uint8_t _value)
+        : simd8(splat(_value))
+    {
+    }
+    // Array constructor
+    simdutf_really_inline simd8(const uint8_t* values)
+        : simd8(load(values))
+    {
+    }
+    // Member-by-member initialization
+    simdutf_really_inline
+    simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
         uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
         uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
-      : simd8((__m128i){v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-                        v13, v14, v15}) {}
-  // Repeat 16 values as many times as necessary (usually for lookup tables)
-  simdutf_really_inline static simd8<uint8_t>
-  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
-            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
-            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
-            uint8_t v15) {
-    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
-                          v13, v14, v15);
-  }
-
-  // Saturated math
-  simdutf_really_inline simd8<uint8_t>
-  saturating_add(const simd8<uint8_t> other) const {
-    return (__m128i)vec_adds(this->value, (__m128i)other);
-  }
-  simdutf_really_inline simd8<uint8_t>
-  saturating_sub(const simd8<uint8_t> other) const {
-    return (__m128i)vec_subs(this->value, (__m128i)other);
-  }
-
-  // Order-specific operations
-  simdutf_really_inline simd8<uint8_t>
-  max_val(const simd8<uint8_t> other) const {
-    return (__m128i)vec_max(this->value, (__m128i)other);
-  }
-  simdutf_really_inline simd8<uint8_t>
-  min_val(const simd8<uint8_t> other) const {
-    return (__m128i)vec_min(this->value, (__m128i)other);
-  }
-  // Same as >, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd8<uint8_t>
-  gt_bits(const simd8<uint8_t> other) const {
-    return this->saturating_sub(other);
-  }
-  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
-  simdutf_really_inline simd8<uint8_t>
-  lt_bits(const simd8<uint8_t> other) const {
-    return other.saturating_sub(*this);
-  }
-  simdutf_really_inline simd8<bool>
-  operator<=(const simd8<uint8_t> other) const {
-    return other.max_val(*this) == other;
-  }
-  simdutf_really_inline simd8<bool>
-  operator>=(const simd8<uint8_t> other) const {
-    return other.min_val(*this) == other;
-  }
-  simdutf_really_inline simd8<bool>
-  operator>(const simd8<uint8_t> other) const {
-    return this->gt_bits(other).any_bits_set();
-  }
-  simdutf_really_inline simd8<bool>
-  operator<(const simd8<uint8_t> other) const {
-    return this->gt_bits(other).any_bits_set();
-  }
-
-  // Bit-specific operations
-  simdutf_really_inline simd8<bool> bits_not_set() const {
-    return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
-  }
-  simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
-    return (*this & bits).bits_not_set();
-  }
-  simdutf_really_inline simd8<bool> any_bits_set() const {
-    return ~this->bits_not_set();
-  }
-  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
-    return ~this->bits_not_set(bits);
-  }
-
-  simdutf_really_inline bool is_ascii() const {
-      return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
-  }
-
-  simdutf_really_inline bool bits_not_set_anywhere() const {
-    return vec_all_eq(this->value, (__m128i)vec_splats(0));
-  }
-  simdutf_really_inline bool any_bits_set_anywhere() const {
-    return !bits_not_set_anywhere();
-  }
-  simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const {
-    return vec_all_eq(vec_and(this->value, (__m128i)bits),
-                      (__m128i)vec_splats(0));
-  }
-  simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
-    return !bits_not_set_anywhere(bits);
-  }
-  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
-    return simd8<uint8_t>(
-        (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
-  }
-  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
-    return simd8<uint8_t>(
-        (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
-  }
+        : simd8((__m128i) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+            v13, v14, v15 })
+    {
+    }
+    // Repeat 16 values as many times as necessary (usually for lookup tables)
+    simdutf_really_inline static simd8<uint8_t>
+    repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+        uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+        uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+        uint8_t v15)
+    {
+        return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+            v13, v14, v15);
+    }
+
+    // Saturated math
+    simdutf_really_inline simd8<uint8_t>
+    saturating_add(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_adds(this->value, (__m128i)other);
+    }
+    simdutf_really_inline simd8<uint8_t>
+    saturating_sub(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_subs(this->value, (__m128i)other);
+    }
+
+    // Order-specific operations
+    simdutf_really_inline simd8<uint8_t>
+    max_val(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_max(this->value, (__m128i)other);
+    }
+    simdutf_really_inline simd8<uint8_t>
+    min_val(const simd8<uint8_t> other) const
+    {
+        return (__m128i)vec_min(this->value, (__m128i)other);
+    }
+    // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+    simdutf_really_inline simd8<uint8_t>
+    gt_bits(const simd8<uint8_t> other) const
+    {
+        return this->saturating_sub(other);
+    }
+    // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+    simdutf_really_inline simd8<uint8_t>
+    lt_bits(const simd8<uint8_t> other) const
+    {
+        return other.saturating_sub(*this);
+    }
+    simdutf_really_inline simd8<bool>
+    operator<=(const simd8<uint8_t> other) const
+    {
+        return other.max_val(*this) == other;
+    }
+    simdutf_really_inline simd8<bool>
+    operator>=(const simd8<uint8_t> other) const
+    {
+        return other.min_val(*this) == other;
+    }
+    simdutf_really_inline simd8<bool>
+    operator>(const simd8<uint8_t> other) const
+    {
+        return this->gt_bits(other).any_bits_set();
+    }
+    // Lane-wise unsigned `*this < other`: lt_bits() is nonzero exactly in the lanes where other exceeds *this.
+    simdutf_really_inline simd8<bool> operator<(const simd8<uint8_t> other) const
+    {
+        return this->lt_bits(other).any_bits_set();
+    }
+
+    // Bit-specific operations
+    simdutf_really_inline simd8<bool> bits_not_set() const
+    {
+        return (__m128i)vec_cmpeq(this->value, (__m128i)vec_splats(uint8_t(0)));
+    }
+    simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const
+    {
+        return (*this & bits).bits_not_set();
+    }
+    simdutf_really_inline simd8<bool> any_bits_set() const
+    {
+        return ~this->bits_not_set();
+    }
+    simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const
+    {
+        return ~this->bits_not_set(bits);
+    }
+
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->saturating_sub(0b01111111u).bits_not_set_anywhere();
+    }
+
+    simdutf_really_inline bool bits_not_set_anywhere() const
+    {
+        return vec_all_eq(this->value, (__m128i)vec_splats(0));
+    }
+    simdutf_really_inline bool any_bits_set_anywhere() const
+    {
+        return !bits_not_set_anywhere();
+    }
+    simdutf_really_inline bool bits_not_set_anywhere(simd8<uint8_t> bits) const
+    {
+        return vec_all_eq(vec_and(this->value, (__m128i)bits),
+            (__m128i)vec_splats(0));
+    }
+    simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const
+    {
+        return !bits_not_set_anywhere(bits);
+    }
+    template<int N> simdutf_really_inline simd8<uint8_t> shr() const
+    {
+        return simd8<uint8_t>(
+            (__m128i)vec_sr(this->value, (__m128i)vec_splat_u8(N)));
+    }
+    template<int N> simdutf_really_inline simd8<uint8_t> shl() const
+    {
+        return simd8<uint8_t>(
+            (__m128i)vec_sl(this->value, (__m128i)vec_splat_u8(N)));
+    }
 };
 
-template <typename T> struct simd8x64 {
-  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
-  static_assert(NUM_CHUNKS == 4,
-                "PPC64 kernel should use four registers per 64-byte block.");
-  simd8<T> chunks[NUM_CHUNKS];
+template<typename T> struct simd8x64 {
+    static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+    static_assert(NUM_CHUNKS == 4,
+        "PPC64 kernel should use four registers per 64-byte block.");
+    simd8<T> chunks[NUM_CHUNKS];
 
-  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
-  simd8x64<T> &
-  operator=(const simd8<T> other) = delete; // no assignment allowed
-  simd8x64() = delete;                      // no default constructor allowed
+    simd8x64(const simd8x64<T>& o) = delete; // no copy allowed
+    simd8x64<T>&
+    operator=(const simd8<T> other)
+        = delete; // no assignment allowed
+    simd8x64() = delete; // no default constructor allowed
 
-  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
-                                  const simd8<T> chunk2, const simd8<T> chunk3)
-      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+    simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+        const simd8<T> chunk2, const simd8<T> chunk3)
+        : chunks { chunk0, chunk1, chunk2, chunk3 }
+    {
+    }
 
-  simdutf_really_inline simd8x64(const T* ptr) : chunks{simd8<T>::load(ptr), simd8<T>::load(ptr+sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+2*sizeof(simd8<T>)/sizeof(T)), simd8<T>::load(ptr+3*sizeof(simd8<T>)/sizeof(T))} {}
+    simdutf_really_inline simd8x64(const T* ptr)
+        : chunks { simd8<T>::load(ptr), simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)), simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T)) }
+    {
+    }
 
-  simdutf_really_inline void store(T* ptr) const {
-    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0/sizeof(T));
-    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1/sizeof(T));
-    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2/sizeof(T));
-    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3/sizeof(T));
-  }
+    simdutf_really_inline void store(T* ptr) const
+    {
+        this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+        this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+        this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+        this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+    }
 
+    simdutf_really_inline simd8x64<T>& operator|=(const simd8x64<T>& other)
+    {
+        this->chunks[0] |= other.chunks[0];
+        this->chunks[1] |= other.chunks[1];
+        this->chunks[2] |= other.chunks[2];
+        this->chunks[3] |= other.chunks[3];
+        return *this;
+    }
 
-  simdutf_really_inline simd8x64<T>& operator |=(const simd8x64<T> &other) {
-      this->chunks[0] |= other.chunks[0];
-      this->chunks[1] |= other.chunks[1];
-      this->chunks[2] |= other.chunks[2];
-      this->chunks[3] |= other.chunks[3];
-      return *this;
+    simdutf_really_inline simd8<T> reduce_or() const
+    {
+        return (this->chunks[0] | this->chunks[1]) | (this->chunks[2] | this->chunks[3]);
     }
 
-  simdutf_really_inline simd8<T> reduce_or() const {
-    return (this->chunks[0] | this->chunks[1]) |
-           (this->chunks[2] | this->chunks[3]);
-  }
+    simdutf_really_inline bool is_ascii() const
+    {
+        return this->reduce_or().is_ascii(); // OR the four chunks together, then run the single-register is_ascii() check on the result
+    }
 
+    simdutf_really_inline uint64_t to_bitmask() const
+    {
+        uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
+        uint64_t r1 = this->chunks[1].to_bitmask();
+        uint64_t r2 = this->chunks[2].to_bitmask();
+        uint64_t r3 = this->chunks[3].to_bitmask();
+        return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
+    }
 
-  simdutf_really_inline bool is_ascii() const {
-    return input.reduce_or().is_ascii();
-  }
-
-  simdutf_really_inline uint64_t to_bitmask() const {
-    uint64_t r0 = uint32_t(this->chunks[0].to_bitmask());
-    uint64_t r1 = this->chunks[1].to_bitmask();
-    uint64_t r2 = this->chunks[2].to_bitmask();
-    uint64_t r3 = this->chunks[3].to_bitmask();
-    return r0 | (r1 << 16) | (r2 << 32) | (r3 << 48);
-  }
-
-  simdutf_really_inline uint64_t eq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
-                          this->chunks[2] == mask, this->chunks[3] == mask)
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
-    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
-                          this->chunks[1] == other.chunks[1],
-                          this->chunks[2] == other.chunks[2],
-                          this->chunks[3] == other.chunks[3])
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t lteq(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
-                          this->chunks[2] <= mask, this->chunks[3] <= mask)
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-
-      return  simd8x64<bool>(
-        (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
-        (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
-        (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
-        (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
-      const simd8<T> mask_low = simd8<T>::splat(low);
-      const simd8<T> mask_high = simd8<T>::splat(high);
-      return  simd8x64<bool>(
-        (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
-        (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
-        (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
-        (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t lt(const T m) const {
-    const simd8<T> mask = simd8<T>::splat(m);
-    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
-                          this->chunks[2] < mask, this->chunks[3] < mask)
-        .to_bitmask();
-  }
-
-  simdutf_really_inline uint64_t gt(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] > mask,
-        this->chunks[1] > mask,
-        this->chunks[2] > mask,
-        this->chunks[3] > mask
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t gteq(const T m) const {
-      const simd8<T> mask = simd8<T>::splat(m);
-      return  simd8x64<bool>(
-        this->chunks[0] >= mask,
-        this->chunks[1] >= mask,
-        this->chunks[2] >= mask,
-        this->chunks[3] >= mask
-      ).to_bitmask();
-  }
-  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
-      const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
-      return  simd8x64<bool>(
-        simd8<uint8_t>(this->chunks[0]) >= mask,
-        simd8<uint8_t>(this->chunks[1]) >= mask,
-        simd8<uint8_t>(this->chunks[2]) >= mask,
-        simd8<uint8_t>(this->chunks[3]) >= mask
-      ).to_bitmask();
-  }
+    simdutf_really_inline uint64_t eq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+            this->chunks[2] == mask, this->chunks[3] == mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t eq(const simd8x64<uint8_t>& other) const
+    {
+        return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+            this->chunks[1] == other.chunks[1],
+            this->chunks[2] == other.chunks[2],
+            this->chunks[3] == other.chunks[3])
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t lteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+            this->chunks[2] <= mask, this->chunks[3] <= mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+
+        return simd8x64<bool>(
+            (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+            (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+            (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+            (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t not_in_range(const T low, const T high) const
+    {
+        const simd8<T> mask_low = simd8<T>::splat(low);
+        const simd8<T> mask_high = simd8<T>::splat(high);
+        return simd8x64<bool>(
+            (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+            (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+            (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+            (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t lt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+            this->chunks[2] < mask, this->chunks[3] < mask)
+            .to_bitmask();
+    }
+
+    simdutf_really_inline uint64_t gt(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] > mask,
+            this->chunks[1] > mask,
+            this->chunks[2] > mask,
+            this->chunks[3] > mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq(const T m) const
+    {
+        const simd8<T> mask = simd8<T>::splat(m);
+        return simd8x64<bool>(
+            this->chunks[0] >= mask,
+            this->chunks[1] >= mask,
+            this->chunks[2] >= mask,
+            this->chunks[3] >= mask)
+            .to_bitmask();
+    }
+    simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const
+    {
+        const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+        return simd8x64<bool>(
+            simd8<uint8_t>(this->chunks[0]) >= mask,
+            simd8<uint8_t>(this->chunks[1]) >= mask,
+            simd8<uint8_t>(this->chunks[2]) >= mask,
+            simd8<uint8_t>(this->chunks[3]) >= mask)
+            .to_bitmask();
+    }
 }; // struct simd8x64<T>
 
 } // namespace simd
@@ -3935,7 +4706,7 @@ template <typename T> struct simd8x64 {
 #endif // SIMDUTF_PPC64_SIMD_INPUT_H
 /* end file src/simdutf/ppc64/simd.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/end.h
 /* begin file src/simdutf/ppc64/end.h */
 /* end file src/simdutf/ppc64/end.h */
 
@@ -3943,12 +4714,11 @@ template <typename T> struct simd8x64 {
 
 #endif // SIMDUTF_PPC64_H
 /* end file src/simdutf/ppc64.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback.h
 /* begin file src/simdutf/fallback.h */
 #ifndef SIMDUTF_FALLBACK_H
 #define SIMDUTF_FALLBACK_H
 
-
 // Note that fallback.h is always imported last.
 
 // Default Fallback to on unless a builtin implementation has already been selected.
@@ -3972,12 +4742,11 @@ namespace fallback {
 } // namespace fallback
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/implementation.h
 /* begin file src/simdutf/fallback/implementation.h */
 #ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
 #define SIMDUTF_FALLBACK_IMPLEMENTATION_H
 
-
 namespace simdutf {
 namespace fallback {
 
@@ -3987,64 +4756,88 @@ using namespace simdutf;
 
 class implementation final : public simdutf::implementation {
 public:
-  simdutf_really_inline implementation() : simdutf::implementation(
-      "fallback",
-      "Generic fallback implementation",
-      0
-  ) {}
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept final;
-  simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16le(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf16be(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) const noexcept final;
-  void change_endianness_utf16(const char16_t * buf, size_t length, char16_t * output) const noexcept final;
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept;
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept;
+    simdutf_really_inline implementation()
+        : simdutf::implementation(
+            "fallback",
+            "Generic fallback implementation",
+            0)
+    {
+    }
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept final;
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) const noexcept final;
+    void change_endianness_utf16(const char16_t* buf, size_t length, char16_t* output) const noexcept final;
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* input, size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) const noexcept;
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) const noexcept;
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* input, size_t length) const noexcept;
 };
 
 } // namespace fallback
@@ -4053,14 +4846,14 @@ public:
 #endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
 /* end file src/simdutf/fallback/implementation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/begin.h
 /* begin file src/simdutf/fallback/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
 // #define SIMDUTF_IMPLEMENTATION fallback
 /* end file src/simdutf/fallback/begin.h */
 
 // Declarations
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/bitmanipulation.h
 /* begin file src/simdutf/fallback/bitmanipulation.h */
 #ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
 #define SIMDUTF_FALLBACK_BITMANIPULATION_H
@@ -4072,19 +4865,21 @@ namespace fallback {
 namespace {
 
 #if defined(_MSC_VER) && !defined(_M_ARM64) && !defined(_M_X64)
-static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x) {
-  unsigned long x0 = (unsigned long)x, top, bottom;
-  _BitScanForward(&top, (unsigned long)(x >> 32));
-  _BitScanForward(&bottom, x0);
-  *ret = x0 ? bottom : 32 + top;
-  return x != 0;
-}
-static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
-  unsigned long x1 = (unsigned long)(x >> 32), top, bottom;
-  _BitScanReverse(&top, x1);
-  _BitScanReverse(&bottom, (unsigned long)x);
-  *ret = x1 ? top + 32 : bottom;
-  return x != 0;
+static inline unsigned char _BitScanForward64(unsigned long* ret, uint64_t x)
+{
+    unsigned long x0 = (unsigned long)x, top, bottom;
+    _BitScanForward(&top, (unsigned long)(x >> 32));
+    _BitScanForward(&bottom, x0);
+    *ret = x0 ? bottom : 32 + top;
+    return x != 0;
+}
+static inline unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x)
+{
+    unsigned long x1 = (unsigned long)(x >> 32), top, bottom; // x1 = upper 32 bits of x
+    _BitScanReverse(&top, x1);
+    _BitScanReverse(&bottom, (unsigned long)x);
+    *ret = x1 ? top + 32 : bottom; // prefer the upper-half index (offset by 32) when the upper half is non-zero
+    return x != 0; // non-zero result only when some bit of x is set
 }
 #endif
 
@@ -4095,7 +4890,7 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
 #endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
 /* end file src/simdutf/fallback/bitmanipulation.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/end.h
 /* begin file src/simdutf/fallback/end.h */
 /* end file src/simdutf/fallback/end.h */
 
@@ -4104,16 +4899,20 @@ static unsigned char _BitScanReverse64(unsigned long* ret, uint64_t x) {
 /* end file src/simdutf/fallback.h */
 
 namespace simdutf {
-bool implementation::supported_by_runtime_system() const {
-  uint32_t required_instruction_sets = this->required_instruction_sets();
-  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
-  return ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets);
+bool implementation::supported_by_runtime_system() const
+{
+    const uint32_t needed = this->required_instruction_sets();
+    const uint32_t available = internal::detect_supported_architectures();
+    return (available & needed) == needed; // true only when every required bit is also set in the detected mask
 }
 
-simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char * input, size_t length) const noexcept {
+simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char* input, size_t length) const noexcept
+{
     // If there is a BOM, then we trust it.
     auto bom_encoding = simdutf::BOM::check_bom(input, length);
-    if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
     // UTF8 is common, it includes ASCII, and is commonly represented
     // without a BOM, so if it fits, go with that. Note that it is still
     // possible to get it wrong, we are only 'guessing'. If some has UTF-16
@@ -4121,15 +4920,21 @@ simdutf_warn_unused encoding_type implementation::autodetect_encoding(const char
     //
     // An interesting twist might be to check for UTF-16 ASCII first (every
     // other byte is zero).
-    if(validate_utf8(input, length)) { return encoding_type::UTF8; }
+    if (validate_utf8(input, length)) {
+        return encoding_type::UTF8;
+    }
     // The next most common encoding that might appear without BOM is probably
     // UTF-16LE, so try that next.
-    if((length % 2) == 0) {
-      // important: we need to divide by two
-      if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { return encoding_type::UTF16_LE; }
+    if ((length % 2) == 0) {
+        // important: we need to divide by two
+        if (validate_utf16le(reinterpret_cast<const char16_t*>(input), length / 2)) {
+            return encoding_type::UTF16_LE;
+        }
     }
-    if((length % 4) == 0) {
-      if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { return encoding_type::UTF32_LE; }
+    if ((length % 4) == 0) {
+        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
+            return encoding_type::UTF32_LE;
+        }
     }
     return encoding_type::unspecified;
 }
@@ -4139,24 +4944,47 @@ namespace internal {
 // Static array of known implementations. We're hoping these get baked into the executable
 // without requiring a static initializer.
 
-
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-const icelake::implementation icelake_singleton{};
+static const icelake::implementation* get_icelake_singleton()
+{
+    static const icelake::implementation instance {}; // function-local static; its address is what callers receive
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-const haswell::implementation haswell_singleton{};
+static const haswell::implementation* get_haswell_singleton()
+{
+    static const haswell::implementation instance {}; // function-local static; its address is what callers receive
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-const westmere::implementation westmere_singleton{};
+static const westmere::implementation* get_westmere_singleton()
+{
+    static const westmere::implementation instance {}; // function-local static; its address is what callers receive
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_ARM64
-const arm64::implementation arm64_singleton{};
+static const arm64::implementation* get_arm64_singleton()
+{
+    static const arm64::implementation instance {}; // function-local static; its address is what callers receive
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-const ppc64::implementation ppc64_singleton{};
+static const ppc64::implementation* get_ppc64_singleton()
+{
+    static const ppc64::implementation instance {}; // function-local static; its address is what callers receive
+    return &instance;
+}
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-const fallback::implementation fallback_singleton{};
+static const fallback::implementation* get_fallback_singleton()
+{
+    static const fallback::implementation instance {}; // function-local static; its address is what callers receive
+    return &instance;
+}
 #endif
 
 /**
@@ -4164,831 +4992,1275 @@ const fallback::implementation fallback_singleton{};
  */
 class detect_best_supported_implementation_on_first_use final : public implementation {
 public:
-  const std::string &name() const noexcept final { return set_best()->name(); }
-  const std::string &description() const noexcept final { return set_best()->description(); }
-  uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
+    const std::string& name() const noexcept final { return set_best()->name(); }
+    const std::string& description() const noexcept final { return set_best()->description(); }
+    uint32_t required_instruction_sets() const noexcept final { return set_best()->required_instruction_sets(); }
+
+    simdutf_warn_unused int detect_encodings(const char* input, size_t length) const noexcept override
+    {
+        return set_best()->detect_encodings(input, length);
+    }
+
+    simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf8(buf, len);
+    }
+
+    simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf8_with_errors(buf, len);
+    }
+
+    simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_ascii(buf, len);
+    }
+
+    simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_ascii_with_errors(buf, len);
+    }
+
+    simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16le(buf, len);
+    }
+
+    simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16be(buf, len);
+    }
+
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16le_with_errors(buf, len);
+    }
 
-  simdutf_warn_unused int detect_encodings(const char * input, size_t length) const noexcept override {
-    return set_best()->detect_encodings(input, length);
-  }
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf16be_with_errors(buf, len);
+    }
 
-  simdutf_warn_unused bool validate_utf8(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf8(buf, len);
-  }
+    simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf32(buf, len);
+    }
 
-  simdutf_warn_unused result validate_utf8_with_errors(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf8_with_errors(buf, len);
-  }
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->validate_utf32_with_errors(buf, len);
+    }
 
-  simdutf_warn_unused bool validate_ascii(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_ascii(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result validate_ascii_with_errors(const char * buf, size_t len) const noexcept final override {
-    return set_best()->validate_ascii_with_errors(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16le(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_latin1_to_utf32(buf, len, utf32_output); // forward unchanged to whatever set_best() returns
+    }
 
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16le_with_errors(buf, len);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16be_with_errors(buf, len);
-  }
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf32(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf32_with_errors(buf, len);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf32_with_errors(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
-  }
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept final override
+    { // Forwards to the implementation returned by set_best() (declared below, defined elsewhere).
+        return set_best()->convert_valid_utf32_to_latin1(buf, len, latin1_output); // was convert_utf32_to_latin1: must hit the same-named "valid" entry point, like every sibling forwarder
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_output);
+    }
 
-  void change_endianness_utf16(const char16_t * buf, size_t len, char16_t * output) const noexcept final override {
-    set_best()->change_endianness_utf16(buf, len, output);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t count_utf16le(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->count_utf16le(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
+    }
 
-  simdutf_warn_unused size_t count_utf16be(const char16_t * buf, size_t len) const noexcept final override {
-    return set_best()->count_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
+    }
+
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
+    }
+
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t count_utf8(const char * buf, size_t len) const noexcept final override {
-    return set_best()->count_utf8(buf, len);
-  }
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf16le(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept final override
+    {
+        return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf16le(buf, len);
-  }
+    void change_endianness_utf16(const char16_t* buf, size_t len, char16_t* output) const noexcept final override
+    {
+        set_best()->change_endianness_utf16(buf, len, output);
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf16be(buf, len);
-  }
+    simdutf_warn_unused size_t count_utf16le(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->count_utf16le(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char * buf, size_t len) const noexcept override {
-    return set_best()->utf16_length_from_utf8(buf, len);
-  }
+    simdutf_warn_unused size_t count_utf16be(const char16_t* buf, size_t len) const noexcept final override
+    {
+        return set_best()->count_utf16be(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf32(buf, len);
-  }
+    simdutf_warn_unused size_t count_utf8(const char* buf, size_t len) const noexcept final override
+    {
+        return set_best()->count_utf8(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * buf, size_t len) const noexcept override {
-    return set_best()->utf16_length_from_utf32(buf, len);
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->latin1_length_from_utf8(buf, len);
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char * buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf8(buf, len);
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) const noexcept override
+    {
+        return set_best()->latin1_length_from_utf16(len);
+    }
 
-  simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0) {}
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) const noexcept override
+    {
+        return set_best()->latin1_length_from_utf32(len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_latin1(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_utf16le(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_utf16be(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t len) const noexcept override
+    {
+        return set_best()->utf16_length_from_latin1(len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_latin1(len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_utf16le(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_utf16be(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf16_length_from_utf8(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf8_length_from_utf32(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf16_length_from_utf32(buf, len);
+    }
+
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char* buf, size_t len) const noexcept override
+    {
+        return set_best()->utf32_length_from_utf8(buf, len);
+    }
+
+    simdutf_really_inline detect_best_supported_implementation_on_first_use() noexcept
+        : implementation("best_supported_detector", "Detects the best supported implementation and sets it", 0)
+    {
+    }
 
 private:
-  const implementation *set_best() const noexcept;
+    const implementation* set_best() const noexcept;
 };
 
-
-const std::initializer_list<const implementation *> available_implementation_pointers {
+static const std::initializer_list<const implementation*>& get_available_implementation_pointers() // Returns the list of implementation pointers enabled by the SIMDUTF_IMPLEMENTATION_* macros below.
+{
+    static const std::initializer_list<const implementation*> available_implementation_pointers // function-local static; replaces the former namespace-scope list of &..._singleton addresses
+    {
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-  &icelake_singleton,
+        get_icelake_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-  &haswell_singleton,
+            get_haswell_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-  &westmere_singleton,
+            get_westmere_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_ARM64
-  &arm64_singleton,
+            get_arm64_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-  &ppc64_singleton,
+            get_ppc64_singleton(),
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-  &fallback_singleton,
+            get_fallback_singleton(),
 #endif
-}; // available_implementation_pointers
+    }; // available_implementation_pointers
+    return available_implementation_pointers; // reference to the static list above
+}
 
 // So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no support
 class unsupported_implementation final : public implementation {
 public:
-  simdutf_warn_unused int detect_encodings(const char *, size_t) const noexcept override {
-    return encoding_type::unspecified;
-  }
-
-  simdutf_warn_unused bool validate_utf8(const char *, size_t) const noexcept final override {
-    return false; // Just refuse to validate. Given that we have a fallback implementation
-    // it seems unlikely that unsupported_implementation will ever be used. If it is used,
-    // then it will flag all strings as invalid. The alternative is to return an error_code
-    // from which the user has to figure out whether the string is valid UTF-8... which seems
-    // like a lot of work just to handle the very unlikely case that we have an unsupported
-    // implementation. And, when it does happen (that we have an unsupported implementation),
-    // what are the chances that the programmer has a fallback? Given that *we* provide the
-    // fallback, it implies that the programmer would need a fallback for our fallback.
-  }
-
-  simdutf_warn_unused result validate_utf8_with_errors(const char *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused bool validate_ascii(const char *, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused result validate_ascii_with_errors(const char *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override {
-    return false;
-  }
-
-  simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
-
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused int detect_encodings(const char*, size_t) const noexcept override
+    {
+        return encoding_type::unspecified;
+    }
+
+    simdutf_warn_unused bool validate_utf8(const char*, size_t) const noexcept final override
+    {
+        return false; // Just refuse to validate. Given that we have a fallback implementation
+        // it seems unlikely that unsupported_implementation will ever be used. If it is used,
+        // then it will flag all strings as invalid. The alternative is to return an error_code
+        // from which the user has to figure out whether the string is valid UTF-8... which seems
+        // like a lot of work just to handle the very unlikely case that we have an unsupported
+        // implementation. And, when it does happen (that we have an unsupported implementation),
+        // what are the chances that the programmer has a fallback? Given that *we* provide the
+        // fallback, it implies that the programmer would need a fallback for our fallback.
+    }
+
+    simdutf_warn_unused result validate_utf8_with_errors(const char*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused bool validate_ascii(const char*, size_t) const noexcept final override
+    {
+        return false;
+    }
+
+    simdutf_warn_unused result validate_ascii_with_errors(const char*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused bool validate_utf16le(const char16_t*, size_t) const noexcept final override
+    {
+        return false;
+    }
+
+    simdutf_warn_unused bool validate_utf16be(const char16_t*, size_t) const noexcept final override
+    {
+        return false;
+    }
+
+    simdutf_warn_unused result validate_utf16le_with_errors(const char16_t*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused result validate_utf16be_with_errors(const char16_t*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused bool validate_utf32(const char32_t*, size_t) const noexcept final override
+    {
+        return false;
+    }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result validate_utf32_with_errors(const char32_t*, size_t) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf8(const char*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_latin1_to_utf32(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf8_to_latin1(const char*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0);
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override // fallback stub for UTF-16 -> Latin-1: parameters are unnamed and unused
+    {
+        return 0; // nothing is written to the output; reports 0
+    }
+
+    simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // with_errors variant: unconditionally reports error_code::OTHER
+    }
+
+    simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // same unconditional error as the little-endian variant
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t*, size_t, char*) const noexcept final override // "valid" variant is stubbed the same way
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override // fallback stub for UTF-16 -> UTF-8: parameters are unnamed and unused
+    {
+        return 0; // nothing is written to the output; reports 0
+    }
+
+    simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // with_errors variant: unconditionally reports error_code::OTHER
+    }
+
+    simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // same unconditional error as the little-endian variant
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t*, size_t, char*) const noexcept final override // "valid" variant is stubbed the same way
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t*, size_t, char*) const noexcept final override // fallback stub for UTF-32 -> Latin-1: parameters are unnamed and unused
+    {
+        return 0; // nothing is written to the output; reports 0
+    }
+
+    simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // with_errors variant: unconditionally reports error_code::OTHER
+    }
+
+    simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t*, size_t, char*) const noexcept final override // "valid" variant is stubbed the same way
+    {
+        return 0;
+    }
+
+    simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override // fallback stub for UTF-32 -> UTF-8: parameters are unnamed and unused
+    {
+        return 0; // nothing is written to the output; reports 0
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // with_errors variant: unconditionally reports error_code::OTHER
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override // "valid" variant is stubbed the same way
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t*, size_t, char*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override // fallback stub for UTF-32 -> UTF-16: parameters are unnamed and unused
+    {
+        return 0; // nothing is written to the output; reports 0
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t*, size_t, char*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // with_errors variant: unconditionally reports error_code::OTHER
+    }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // same unconditional error as the little-endian variant
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override // "valid" variant is stubbed the same way
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override // fallback stub for UTF-16 -> UTF-32: parameters are unnamed and unused
+    {
+        return 0; // nothing is written to the output; reports 0
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t*, size_t, char16_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // with_errors variant: unconditionally reports error_code::OTHER
+    }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return result(error_code::OTHER, 0); // same unconditional error as the little-endian variant
+    }
 
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override // "valid" variant is stubbed the same way
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+    simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    void change_endianness_utf16(const char16_t*, size_t, char16_t*) const noexcept final override // fallback stub: empty body, so the output buffer is left untouched
+    {
+    }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t*, size_t, char32_t*) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t count_utf16le(const char16_t*, size_t) const noexcept final override // counting stub: input is ignored
+    {
+        return 0; // always reports a count of 0
+    }
 
-  void change_endianness_utf16(const char16_t *, size_t, char16_t *) const noexcept final override {
+    simdutf_warn_unused size_t count_utf16be(const char16_t*, size_t) const noexcept final override
+    {
+        return 0;
+    }
 
-  }
+    simdutf_warn_unused size_t count_utf8(const char*, size_t) const noexcept final override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t count_utf16le(const char16_t *, size_t) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char*, size_t) const noexcept override // length stub: input ignored; declared override without final, unlike count_utf16le — NOTE(review): confirm intentional
+    {
+        return 0; // always reports a length of 0
+    }
 
-  simdutf_warn_unused size_t count_utf16be(const char16_t *, size_t) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf16(size_t) const noexcept override // takes only a length, no buffer
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t count_utf8(const char *, size_t) const noexcept final override {
-    return 0;
-  }
+    simdutf_warn_unused size_t latin1_length_from_utf32(size_t) const noexcept override // takes only a length, no buffer
+    {
+        return 0;
+    }
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t*, size_t) const noexcept override // length stub: parameters are unnamed and unused (reformatted only; old body was identical)
+    {
+        return 0; // always reports a length of 0
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_latin1(size_t) const noexcept override // length stub added by this patch: takes only a length, which is ignored
+    {
+        return 0; // always reports a length of 0
+    }
 
-  simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf16_length_from_utf8(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
+    simdutf_warn_unused size_t utf16_length_from_latin1(size_t) const noexcept override // takes only a length, no buffer
+    {
+        return 0;
+    }
+    simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  simdutf_warn_unused size_t utf32_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
-  }
+    simdutf_warn_unused size_t utf32_length_from_utf8(const char*, size_t) const noexcept override
+    {
+        return 0;
+    }
 
-  unsupported_implementation() : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) {}
+    unsupported_implementation() // default constructor: only initialises the implementation base
+        : implementation("unsupported", "Unsupported CPU (no detected SIMD instructions)", 0) // name, description, and 0 (presumably the required-instruction-set mask — confirm against the base class)
+    {
+    }
 };
 
-const unsupported_implementation unsupported_singleton{};
+const unsupported_implementation unsupported_singleton {}; // single const instance; its address is handed out as the fallback by detect_best_supported() and set_best()
 
-size_t available_implementation_list::size() const noexcept {
-  return internal::available_implementation_pointers.size();
+size_t available_implementation_list::size() const noexcept // number of entries in the implementation pointer list
+{
+    return internal::get_available_implementation_pointers().size(); // patch switches from the namespace-scope variable to an accessor function
 }
-const implementation * const *available_implementation_list::begin() const noexcept {
-  return internal::available_implementation_pointers.begin();
+const implementation* const* available_implementation_list::begin() const noexcept // start of the implementation pointer list, for iteration
+{
+    return internal::get_available_implementation_pointers().begin(); // patch switches from the namespace-scope variable to an accessor function
 }
-const implementation * const *available_implementation_list::end() const noexcept {
-  return internal::available_implementation_pointers.end();
+const implementation* const* available_implementation_list::end() const noexcept // one-past-the-end of the implementation pointer list
+{
+    return internal::get_available_implementation_pointers().end(); // patch switches from the namespace-scope variable to an accessor function
 }
-const implementation *available_implementation_list::detect_best_supported() const noexcept {
-  // They are prelisted in priority order, so we just go down the list
-  uint32_t supported_instruction_sets = internal::detect_supported_architectures();
-  for (const implementation *impl : internal::available_implementation_pointers) {
-    uint32_t required_instruction_sets = impl->required_instruction_sets();
-    if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { return impl; }
-  }
-  return &unsupported_singleton; // this should never happen?
+const implementation* available_implementation_list::detect_best_supported() const noexcept // returns the first listed implementation whose required instruction sets are all available
+{
+    // They are prelisted in priority order, so we just go down the list
+    uint32_t supported_instruction_sets = internal::detect_supported_architectures(); // bitmask of what this CPU offers
+    for (const implementation* impl : internal::get_available_implementation_pointers()) {
+        uint32_t required_instruction_sets = impl->required_instruction_sets();
+        if ((supported_instruction_sets & required_instruction_sets) == required_instruction_sets) { // every required bit is also set in the supported mask
+            return impl;
+        }
+    }
+    return &unsupported_singleton; // this should never happen?
 }
 
-const implementation *detect_best_supported_implementation_on_first_use::set_best() const noexcept {
-  SIMDUTF_PUSH_DISABLE_WARNINGS
-  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
-  char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
-  SIMDUTF_POP_DISABLE_WARNINGS
+const implementation* detect_best_supported_implementation_on_first_use::set_best() const noexcept // picks an implementation, stores it as the active one, and returns it
+{
+    SIMDUTF_PUSH_DISABLE_WARNINGS
+    SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: manually verified this is safe
+        char* force_implementation_name // NOTE(review): odd indent and line split look like a formatter artifact around the macros above; this is still one ordinary declaration
+        = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); // optional override: name of the implementation to force
+    SIMDUTF_POP_DISABLE_WARNINGS
 
-  if (force_implementation_name) {
-    auto force_implementation = get_available_implementations()[force_implementation_name];
-    if (force_implementation) {
-      return get_active_implementation() = force_implementation;
-    } else {
-      // Note: abort() and stderr usage within the library is forbidden.
-      return get_active_implementation() = &unsupported_singleton;
+    if (force_implementation_name) { // environment variable is set
+        auto force_implementation = get_available_implementations()[force_implementation_name]; // look the implementation up by name
+        if (force_implementation) {
+            return get_active_implementation() = force_implementation; // name matched: install it
+        } else {
+            // Note: abort() and stderr usage within the library is forbidden.
+            return get_active_implementation() = &unsupported_singleton; // unknown name: install the stub fallback instead of failing loudly
+        }
     }
-  }
-  return get_active_implementation() = get_available_implementations().detect_best_supported();
+    return get_active_implementation() = get_available_implementations().detect_best_supported(); // no override: choose by detected CPU features
 }
 
 } // namespace internal
 
-
-
 /**
  * The list of available implementations compiled into simdutf.
  */
-SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() {
-  static const internal::available_implementation_list available_implementations{};
-  return available_implementations;
+SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() // accessor for the one shared list object
+{
+    static const internal::available_implementation_list available_implementations {}; // function-local static: constructed the first time control passes through here
+    return available_implementations; // every call returns a reference to the same object
 }
 
 /**
-  * The active implementation.
-  */
-SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() {
+ * The active implementation.
+ */
+SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation>& get_active_implementation() // returns a mutable reference, so callers (e.g. set_best) can assign a new implementation through it
+{
     static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton;
-    static internal::atomic_ptr<const implementation> active_implementation{&detect_best_supported_implementation_on_first_use_singleton};
+    static internal::atomic_ptr<const implementation> active_implementation { &detect_best_supported_implementation_on_first_use_singleton }; // starts out pointing at the detect-on-first-use placeholder declared above
     return active_implementation;
 }
 
-simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf8(buf, len);
+simdutf_warn_unused bool validate_utf8(const char* buf, size_t len) noexcept // free-function entry point: forwards unchanged to the active implementation
+{
+    return get_active_implementation()->validate_utf8(buf, len);
 }
-simdutf_warn_unused result validate_utf8_with_errors(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result validate_utf8_with_errors(const char* buf, size_t len) noexcept // same forwarding; returns the implementation's result object instead of a bool
+{
+    return get_active_implementation()->validate_utf8_with_errors(buf, len);
 }
-simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_ascii(buf, len);
+simdutf_warn_unused bool validate_ascii(const char* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_ascii(buf, len);
 }
-simdutf_warn_unused result validate_ascii_with_errors(const char *buf, size_t len) noexcept {
-  return get_active_implementation()->validate_ascii_with_errors(buf, len);
+simdutf_warn_unused result validate_ascii_with_errors(const char* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_ascii_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf8_to_utf16be(input, length, utf16_output);
-  #else
-  return convert_utf8_to_utf16le(input, length, utf16_output);
-  #endif
+simdutf_warn_unused size_t convert_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_output) noexcept // endianness-neutral name: resolved at compile time to the BE or LE variant
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf8_to_utf16be(input, length, utf16_output); // taken when SIMDUTF_IS_BIG_ENDIAN is non-zero
+#else
+    return convert_utf8_to_utf16le(input, length, utf16_output); // taken otherwise
+#endif
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
+simdutf_warn_unused size_t convert_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf8_to_utf16le(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
+simdutf_warn_unused size_t convert_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf8_to_utf16be(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
-  #else
-  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
-  #endif
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept // compile-time BE/LE dispatch, result-returning flavour
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+#else
+    return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+#endif
 }
-simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char * input, size_t length, char16_t* utf16_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(const char* input, size_t length, char16_t* utf16_output) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
+simdutf_warn_unused size_t convert_latin1_to_utf16le(const char* input, size_t length, char16_t* utf16_output) noexcept // new entry point in this patch: forwards to the active implementation
+{
+    return get_active_implementation()->convert_latin1_to_utf16le(input, length, utf16_output);
 }
-simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept {
-  return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
+simdutf_warn_unused size_t convert_latin1_to_utf16be(const char* input, size_t length, char16_t* utf16_output) noexcept // new entry point in this patch: forwards to the active implementation
+{
+    return get_active_implementation()->convert_latin1_to_utf16be(input, length, utf16_output);
 }
-simdutf_warn_unused bool validate_utf16(const char16_t * buf, size_t len) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return validate_utf16be(buf, len);
-  #else
-  return validate_utf16le(buf, len);
-  #endif
+simdutf_warn_unused size_t convert_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_output) noexcept // forwards to the active implementation (moved down by the insertions above, body unchanged)
+{
+    return get_active_implementation()->convert_utf8_to_utf32(input, length, utf32_output);
 }
-simdutf_warn_unused bool validate_utf16le(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16le(buf, len);
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char* input, size_t length, char32_t* utf32_output) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf8_to_utf32_with_errors(input, length, utf32_output);
 }
-simdutf_warn_unused bool validate_utf16be(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16be(buf, len);
+simdutf_warn_unused bool validate_utf16(const char16_t* buf, size_t len) noexcept // endianness-neutral name: resolved at compile time to the BE or LE validator
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return validate_utf16be(buf, len); // taken when SIMDUTF_IS_BIG_ENDIAN is non-zero
+#else
+    return validate_utf16le(buf, len); // taken otherwise
+#endif
 }
-simdutf_warn_unused result validate_utf16_with_errors(const char16_t * buf, size_t len) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return validate_utf16be_with_errors(buf, len);
-  #else
-  return validate_utf16le_with_errors(buf, len);
-  #endif
+simdutf_warn_unused bool validate_utf16le(const char16_t* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_utf16le(buf, len);
 }
-simdutf_warn_unused result validate_utf16le_with_errors(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16le_with_errors(buf, len);
+simdutf_warn_unused bool validate_utf16be(const char16_t* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_utf16be(buf, len);
 }
-simdutf_warn_unused result validate_utf16be_with_errors(const char16_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf16be_with_errors(buf, len);
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t* buf, size_t len) noexcept // compile-time BE/LE dispatch, result-returning flavour
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return validate_utf16be_with_errors(buf, len);
+#else
+    return validate_utf16le_with_errors(buf, len);
+#endif
 }
-simdutf_warn_unused bool validate_utf32(const char32_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf32(buf, len);
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_utf16le_with_errors(buf, len);
 }
-simdutf_warn_unused result validate_utf32_with_errors(const char32_t * buf, size_t len) noexcept {
-  return get_active_implementation()->validate_utf32_with_errors(buf, len);
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_utf16be_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
-  #else
-  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
-  #endif
+simdutf_warn_unused bool validate_utf32(const char32_t* buf, size_t len) noexcept // no endianness split for UTF-32 here: forwards directly
+{
+    return get_active_implementation()->validate_utf32(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t* buf, size_t len) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->validate_utf32_with_errors(buf, len);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16(const char* input, size_t length, char16_t* utf16_buffer) noexcept // endianness-neutral name: resolved at compile time to the BE or LE variant
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); // taken when SIMDUTF_IS_BIG_ENDIAN is non-zero
+#else
+    return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); // taken otherwise
+#endif
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(const char* input, size_t length, char16_t* utf16_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
-  #else
-  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
-  #endif
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char* input, size_t length, char16_t* utf16_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char* input, size_t length, char32_t* utf32_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_valid_utf8_to_utf32(input, length, utf32_buffer);
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept // endianness-neutral name: resolved at compile time to the BE or LE variant
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf8(buf, len, utf8_buffer); // taken when SIMDUTF_IS_BIG_ENDIAN is non-zero
+#else
+    return convert_utf16le_to_utf8(buf, len, utf8_buffer); // taken otherwise
+#endif
 }
-simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
-  #else
-  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
-  #endif
-}
-simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
-}
-simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) noexcept // new entry point in this patch: forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16be_to_latin1(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  #if BIG_ENDIAN
-  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
-  #else
-  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
-  #endif
-}
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_buffer) noexcept // new entry point in this patch: forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16le_to_latin1(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) noexcept // result-returning flavour: forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
+simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_buffer) noexcept // result-returning flavour: forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t * buf, size_t len, char* utf8_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
-  #else
-  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
-  #endif
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16le_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
-}
-simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
-  #else
-  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
-  #endif
-}
-simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16be_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept // endianness-neutral name: compile-time BE/LE dispatch, result-returning flavour
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); // taken when SIMDUTF_IS_BIG_ENDIAN is non-zero
+#else
+    return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); // taken otherwise
+#endif
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
-  #else
-  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
-  #endif
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_buffer) noexcept // forwards to the active implementation
+{
+    return get_active_implementation()->convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t * buf, size_t len, char16_t* utf16_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+#else
+    return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
+#endif
 }
-simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
-  #else
-  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
-  #endif
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
-  #else
-  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
-  #endif
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf8_with_errors(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf32_to_utf8(buf, len, utf8_buffer);
 }
-simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf32_to_utf16be(buf, len, utf16_buffer);
+#else
+    return convert_utf32_to_utf16le(buf, len, utf16_buffer);
+#endif
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
-  #else
-  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
-  #endif
+simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t* input, size_t length, char* latin1_output) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_latin1(input, length, latin1_output);
 }
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16le(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * buf, size_t len, char32_t* utf32_buffer) noexcept {
-  return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+simdutf_warn_unused size_t convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16be(buf, len, utf16_buffer);
 }
-void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) noexcept {
-  get_active_implementation()->change_endianness_utf16(input, length, output);
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+#else
+    return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+#endif
 }
-simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return count_utf16be(input, length);
-  #else
-  return count_utf16le(input, length);
-  #endif
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf16le(input, length);
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf16be(input, length);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+#else
+    return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t count_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->count_utf8(input, length);
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
 }
-simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t length) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return utf8_length_from_utf16be(input, length);
-  #else
-  return utf8_length_from_utf16le(input, length);
-  #endif
+simdutf_warn_unused size_t convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf32(buf, len, utf32_buffer);
+#else
+    return convert_utf16le_to_utf32(buf, len, utf32_buffer);
+#endif
 }
-simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf16le(input, length);
+simdutf_warn_unused size_t convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_utf32(buf, len, utf32_buffer);
+}
+simdutf_warn_unused size_t convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_utf32(buf, len, utf32_buffer);
+}
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+#else
+    return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+#endif
 }
-simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf16be(input, length);
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_t length) noexcept {
-  #if SIMDUTF_IS_BIG_ENDIAN
-  return utf32_length_from_utf16be(input, length);
-  #else
-  return utf32_length_from_utf16le(input, length);
-  #endif
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf16le(input, length);
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+#else
+    return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+#endif
 }
-simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf16be(input, length);
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_buffer) noexcept
+{
+    return get_active_implementation()->convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
 }
-simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf8_length_from_utf32(input, length);
+void change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) noexcept
+{
+    get_active_implementation()->change_endianness_utf16(input, length, output);
 }
-simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_t length) noexcept {
-  return get_active_implementation()->utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t count_utf16(const char16_t* input, size_t length) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return count_utf16be(input, length);
+#else
+    return count_utf16le(input, length);
+#endif
 }
-simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t length) noexcept {
-  return get_active_implementation()->utf32_length_from_utf8(input, length);
+simdutf_warn_unused size_t count_utf16le(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->count_utf16le(input, length);
 }
-simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char * buf, size_t length) noexcept {
-  return get_active_implementation()->autodetect_encoding(buf, length);
+simdutf_warn_unused size_t count_utf16be(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->count_utf16be(input, length);
 }
-simdutf_warn_unused int detect_encodings(const char * buf, size_t length) noexcept {
-  return get_active_implementation()->detect_encodings(buf, length);
+simdutf_warn_unused size_t count_utf8(const char* input, size_t length) noexcept
+{
+    return get_active_implementation()->count_utf8(input, length);
 }
-
-const implementation * builtin_implementation() {
-  static const implementation * builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
-  return builtin_impl;
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t* input, size_t length) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return utf8_length_from_utf16be(input, length);
+#else
+    return utf8_length_from_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf8_length_from_utf16le(input, length);
+}
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf8_length_from_utf16be(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t* input, size_t length) noexcept
+{
+#if SIMDUTF_IS_BIG_ENDIAN
+    return utf32_length_from_utf16be(input, length);
+#else
+    return utf32_length_from_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf32_length_from_utf16le(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf32_length_from_utf16be(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_utf8(const char* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf16_length_from_utf8(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept
+{
+    return get_active_implementation()->utf16_length_from_latin1(length);
+}
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf8_length_from_utf32(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf16_length_from_utf32(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf8(const char* input, size_t length) noexcept
+{
+    return get_active_implementation()->utf32_length_from_utf8(input, length);
+}
+simdutf_warn_unused simdutf::encoding_type autodetect_encoding(const char* buf, size_t length) noexcept
+{
+    return get_active_implementation()->autodetect_encoding(buf, length);
+}
+simdutf_warn_unused int detect_encodings(const char* buf, size_t length) noexcept
+{
+    return get_active_implementation()->detect_encodings(buf, length);
 }
 
+const implementation* builtin_implementation()
+{
+    static const implementation* builtin_impl = get_available_implementations()[SIMDUTF_STRINGIFY(SIMDUTF_BUILTIN_IMPLEMENTATION)];
+    return builtin_impl;
+}
 
 } // namespace simdutf
 
 /* end file src/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=encoding_types.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=encoding_types.cpp
 /* begin file src/encoding_types.cpp */
 
 namespace simdutf {
-bool match_system(endianness e) {
+bool match_system(endianness e)
+{
 #if SIMDUTF_IS_BIG_ENDIAN
     return e == endianness::BIG;
 #else
@@ -4996,69 +6268,91 @@ bool match_system(endianness e) {
 #endif
 }
 
-std::string to_string(encoding_type bom) {
-  switch (bom) {
-      case UTF16_LE:     return "UTF16 little-endian";
-      case UTF16_BE:     return "UTF16 big-endian";
-      case UTF32_LE:     return "UTF32 little-endian";
-      case UTF32_BE:     return "UTF32 big-endian";
-      case UTF8:         return "UTF8";
-      case unspecified:  return "unknown";
-      default:           return "error";
-  }
+std::string to_string(encoding_type bom) // Maps an encoding_type value to a fixed display string.
+{
+    switch (bom) {
+    case UTF16_LE:
+        return "UTF16 little-endian";
+    case UTF16_BE:
+        return "UTF16 big-endian";
+    case UTF32_LE:
+        return "UTF32 little-endian";
+    case UTF32_BE:
+        return "UTF32 big-endian";
+    case UTF8:
+        return "UTF8";
+    case unspecified:
+        return "unknown";
+    default: // Any value not matched by the cases above.
+        return "error";
+    }
 }
 
 namespace BOM {
 // Note that BOM for UTF8 is discouraged.
-encoding_type check_bom(const uint8_t* byte, size_t length) {
-        if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
-            if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
-                return encoding_type::UTF32_LE;
-            } else {
-                return encoding_type::UTF16_LE;
-            }
-        } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
-            return encoding_type::UTF16_BE;
-        } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) {
-            return encoding_type::UTF32_BE;
-        } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and byte[3] == 0xbf) {
-            return encoding_type::UTF8;
+encoding_type check_bom(const uint8_t* byte, size_t length) // Classifies the leading bytes of a buffer as a byte-order mark.
+{
+    if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { // FF FE: UTF-16LE, or UTF-32LE when followed by 00 00.
+        if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
+            return encoding_type::UTF32_LE;
+        } else {
+            return encoding_type::UTF16_LE;
         }
-        return encoding_type::unspecified;
+    } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { // FE FF
+        return encoding_type::UTF16_BE;
+    } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and byte[2] == 0xfe and byte[3] == 0xff) { // 00 00 FE FF
+        return encoding_type::UTF32_BE;
+    } else if (length >= 3 && byte[0] == 0xef and byte[1] == 0xbb and byte[2] == 0xbf) { // EF BB BF: three bytes, so test byte[2] and need only length >= 3.
+        return encoding_type::UTF8;
     }
+    return encoding_type::unspecified; // No recognised BOM prefix.
+}
 
-encoding_type check_bom(const char* byte, size_t length) {
-      return check_bom(reinterpret_cast<const uint8_t*>(byte), length);
- }
-
- size_t bom_byte_size(encoding_type bom) {
-        switch (bom) {
-            case UTF16_LE:     return 2;
-            case UTF16_BE:     return 2;
-            case UTF32_LE:     return 4;
-            case UTF32_BE:     return 4;
-            case UTF8:         return 3;
-            case unspecified:  return 0;
-            default:           return 0;
-        }
+encoding_type check_bom(const char* byte, size_t length) // Overload taking a char buffer.
+{
+    return check_bom(reinterpret_cast<const uint8_t*>(byte), length); // Views the same bytes as uint8_t and delegates to the overload above.
+}
+
+size_t bom_byte_size(encoding_type bom) // Returns a per-encoding byte count: 2 for UTF-16, 4 for UTF-32, 3 for UTF-8, otherwise 0.
+{
+    switch (bom) {
+    case UTF16_LE:
+        return 2;
+    case UTF16_BE:
+        return 2;
+    case UTF32_LE:
+        return 4;
+    case UTF32_BE:
+        return 4;
+    case UTF8:
+        return 3;
+    case unspecified:
+        return 0;
+    default: // Any value not matched by the cases above.
+        return 0;
+    }
 }
 
 }
 }
 /* end file src/encoding_types.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=error.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=error.cpp
 /* begin file src/error.cpp */
 namespace simdutf {
 
-  simdutf_really_inline result::result() : error{error_code::SUCCESS}, count{0} {};
+simdutf_really_inline result::result() // Default constructor: error is SUCCESS and count is 0.
+    : error { error_code::SUCCESS }
+    , count { 0 } {};
 
-  simdutf_really_inline result::result(error_code _err, size_t _pos) : error{_err}, count{_pos} {};
+simdutf_really_inline result::result(error_code _err, size_t _pos) // Stores _err in error and _pos in count.
+    : error { _err }
+    , count { _pos } {};
 
 }
 /* end file src/error.cpp */
 // The large tables should be included once and they
 // should not depend on a kernel.
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf8_to_utf16_tables.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=tables/utf8_to_utf16_tables.h
 /* begin file src/tables/utf8_to_utf16_tables.h */
 #ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
 #define SIMDUTF_UTF8_TO_UTF16_TABLES_H
@@ -5080,4316 +6374,4314 @@ namespace utf8_to_utf16 {
  * performance penalty.
  */
 
-const uint8_t shufutf8[209][16] =
-{	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
- 	{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
- 	{1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
- 	{2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
- 	{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
+const uint8_t shufutf8[209][16] = { { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0 },
+    { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0 },
+    { 1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0 },
+    { 2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0 },
+    { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0 } };
 /* number of two bytes : 64 */
 /* number of two + three bytes : 145 */
 /* number of two + three + four bytes : 209 */
-const uint8_t utf8bigindex[4096][2] =
-{	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{0, 12},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{171, 8},
- 	{72, 8},
- 	{183, 8},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{186, 8},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{187, 9},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{77, 7},
- 	{95, 7},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{192, 11},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{204, 11},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{207, 11},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{117, 11},
- 	{72, 8},
- 	{135, 11},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{141, 11},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{143, 11},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{31, 11},
- 	{47, 11},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{55, 11},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{59, 11},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{61, 11},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{62, 11},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{0, 12},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{208, 12},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{171, 8},
- 	{72, 8},
- 	{183, 8},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{186, 8},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{144, 12},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{77, 7},
- 	{95, 7},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{63, 12},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{148, 6},
- 	{0, 12},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{0, 12},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{192, 11},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{204, 11},
- 	{155, 7},
- 	{167, 7},
- 	{69, 7},
- 	{179, 7},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{170, 7},
- 	{71, 7},
- 	{182, 7},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{185, 7},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{207, 11},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{117, 11},
- 	{72, 8},
- 	{135, 11},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{141, 11},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{104, 8},
- 	{68, 6},
- 	{122, 8},
- 	{74, 6},
- 	{92, 6},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{76, 6},
- 	{94, 6},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{77, 7},
- 	{95, 7},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{143, 11},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{31, 11},
- 	{47, 11},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{55, 11},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{59, 11},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{61, 11},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{147, 5},
- 	{0, 12},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{0, 12},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{176, 10},
- 	{148, 6},
- 	{188, 10},
- 	{151, 6},
- 	{163, 6},
- 	{66, 6},
- 	{200, 10},
- 	{154, 6},
- 	{166, 6},
- 	{68, 6},
- 	{178, 6},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{169, 6},
- 	{70, 6},
- 	{181, 6},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{191, 10},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{203, 10},
- 	{90, 10},
- 	{108, 10},
- 	{69, 7},
- 	{126, 10},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{114, 10},
- 	{71, 7},
- 	{132, 10},
- 	{77, 7},
- 	{95, 7},
- 	{65, 5},
- 	{194, 7},
- 	{83, 7},
- 	{101, 7},
- 	{67, 5},
- 	{119, 7},
- 	{73, 5},
- 	{91, 5},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{138, 10},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{103, 7},
- 	{68, 6},
- 	{121, 7},
- 	{74, 6},
- 	{92, 6},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{76, 6},
- 	{94, 6},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{206, 10},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{116, 10},
- 	{72, 8},
- 	{134, 10},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{140, 10},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{62, 11},
- 	{15, 10},
- 	{122, 8},
- 	{23, 10},
- 	{39, 10},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{27, 10},
- 	{43, 10},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{51, 10},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{29, 10},
- 	{45, 10},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{53, 10},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{57, 10},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{146, 4},
- 	{0, 12},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{160, 9},
- 	{172, 9},
- 	{147, 5},
- 	{184, 9},
- 	{150, 5},
- 	{162, 5},
- 	{65, 5},
- 	{196, 9},
- 	{153, 5},
- 	{165, 5},
- 	{67, 5},
- 	{177, 5},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{175, 9},
- 	{148, 6},
- 	{142, 10},
- 	{81, 9},
- 	{99, 9},
- 	{66, 6},
- 	{199, 9},
- 	{87, 9},
- 	{105, 9},
- 	{68, 6},
- 	{123, 9},
- 	{74, 6},
- 	{92, 6},
- 	{64, 4},
- 	{0, 12},
- 	{157, 6},
- 	{111, 9},
- 	{70, 6},
- 	{129, 9},
- 	{76, 6},
- 	{94, 6},
- 	{65, 5},
- 	{193, 6},
- 	{82, 6},
- 	{100, 6},
- 	{67, 5},
- 	{118, 6},
- 	{73, 5},
- 	{91, 5},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{190, 9},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{202, 9},
- 	{89, 9},
- 	{107, 9},
- 	{69, 7},
- 	{125, 9},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{113, 9},
- 	{71, 7},
- 	{131, 9},
- 	{30, 10},
- 	{46, 10},
- 	{7, 9},
- 	{194, 7},
- 	{83, 7},
- 	{54, 10},
- 	{11, 9},
- 	{119, 7},
- 	{19, 9},
- 	{35, 9},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{137, 9},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{58, 10},
- 	{13, 9},
- 	{121, 7},
- 	{21, 9},
- 	{37, 9},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{25, 9},
- 	{41, 9},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{49, 9},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{145, 3},
- 	{205, 9},
- 	{156, 8},
- 	{168, 8},
- 	{146, 4},
- 	{180, 8},
- 	{149, 4},
- 	{161, 4},
- 	{64, 4},
- 	{0, 12},
- 	{159, 8},
- 	{115, 9},
- 	{72, 8},
- 	{133, 9},
- 	{78, 8},
- 	{96, 8},
- 	{65, 5},
- 	{195, 8},
- 	{84, 8},
- 	{102, 8},
- 	{67, 5},
- 	{120, 8},
- 	{73, 5},
- 	{91, 5},
- 	{64, 4},
- 	{0, 12},
- 	{0, 12},
- 	{174, 8},
- 	{148, 6},
- 	{139, 9},
- 	{80, 8},
- 	{98, 8},
- 	{66, 6},
- 	{198, 8},
- 	{86, 8},
- 	{60, 10},
- 	{14, 9},
- 	{122, 8},
- 	{22, 9},
- 	{38, 9},
- 	{3, 8},
- 	{0, 12},
- 	{157, 6},
- 	{110, 8},
- 	{70, 6},
- 	{128, 8},
- 	{26, 9},
- 	{42, 9},
- 	{5, 8},
- 	{193, 6},
- 	{82, 6},
- 	{50, 9},
- 	{9, 8},
- 	{118, 6},
- 	{17, 8},
- 	{33, 8},
- 	{0, 6},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{0, 12},
- 	{189, 8},
- 	{152, 7},
- 	{164, 7},
- 	{145, 3},
- 	{201, 8},
- 	{88, 8},
- 	{106, 8},
- 	{69, 7},
- 	{124, 8},
- 	{75, 7},
- 	{93, 7},
- 	{64, 4},
- 	{0, 12},
- 	{158, 7},
- 	{112, 8},
- 	{71, 7},
- 	{130, 8},
- 	{28, 9},
- 	{44, 9},
- 	{6, 8},
- 	{194, 7},
- 	{83, 7},
- 	{52, 9},
- 	{10, 8},
- 	{119, 7},
- 	{18, 8},
- 	{34, 8},
- 	{1, 7},
- 	{0, 12},
- 	{0, 12},
- 	{173, 7},
- 	{148, 6},
- 	{136, 8},
- 	{79, 7},
- 	{97, 7},
- 	{66, 6},
- 	{197, 7},
- 	{85, 7},
- 	{56, 9},
- 	{12, 8},
- 	{121, 7},
- 	{20, 8},
- 	{36, 8},
- 	{2, 7},
- 	{0, 12},
- 	{157, 6},
- 	{109, 7},
- 	{70, 6},
- 	{127, 7},
- 	{24, 8},
- 	{40, 8},
- 	{4, 7},
- 	{193, 6},
- 	{82, 6},
- 	{48, 8},
- 	{8, 7},
- 	{118, 6},
- 	{16, 7},
- 	{32, 7},
- 	{0, 6}};
+const uint8_t utf8bigindex[4096][2] = { { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 209, 12 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 171, 8 },
+    { 72, 8 },
+    { 183, 8 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 186, 8 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 187, 9 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 77, 7 },
+    { 95, 7 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 192, 11 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 204, 11 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 207, 11 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 117, 11 },
+    { 72, 8 },
+    { 135, 11 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 141, 11 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 143, 11 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 31, 11 },
+    { 47, 11 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 55, 11 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 59, 11 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 61, 11 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 62, 11 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 209, 12 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 208, 12 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 171, 8 },
+    { 72, 8 },
+    { 183, 8 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 186, 8 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 144, 12 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 77, 7 },
+    { 95, 7 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 63, 12 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 148, 6 },
+    { 209, 12 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 209, 12 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 192, 11 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 204, 11 },
+    { 155, 7 },
+    { 167, 7 },
+    { 69, 7 },
+    { 179, 7 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 170, 7 },
+    { 71, 7 },
+    { 182, 7 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 185, 7 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 207, 11 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 117, 11 },
+    { 72, 8 },
+    { 135, 11 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 141, 11 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 104, 8 },
+    { 68, 6 },
+    { 122, 8 },
+    { 74, 6 },
+    { 92, 6 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 76, 6 },
+    { 94, 6 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 77, 7 },
+    { 95, 7 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 143, 11 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 31, 11 },
+    { 47, 11 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 55, 11 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 59, 11 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 61, 11 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 147, 5 },
+    { 209, 12 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 209, 12 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 176, 10 },
+    { 148, 6 },
+    { 188, 10 },
+    { 151, 6 },
+    { 163, 6 },
+    { 66, 6 },
+    { 200, 10 },
+    { 154, 6 },
+    { 166, 6 },
+    { 68, 6 },
+    { 178, 6 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 169, 6 },
+    { 70, 6 },
+    { 181, 6 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 191, 10 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 203, 10 },
+    { 90, 10 },
+    { 108, 10 },
+    { 69, 7 },
+    { 126, 10 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 114, 10 },
+    { 71, 7 },
+    { 132, 10 },
+    { 77, 7 },
+    { 95, 7 },
+    { 65, 5 },
+    { 194, 7 },
+    { 83, 7 },
+    { 101, 7 },
+    { 67, 5 },
+    { 119, 7 },
+    { 73, 5 },
+    { 91, 5 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 138, 10 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 103, 7 },
+    { 68, 6 },
+    { 121, 7 },
+    { 74, 6 },
+    { 92, 6 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 76, 6 },
+    { 94, 6 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 206, 10 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 116, 10 },
+    { 72, 8 },
+    { 134, 10 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 140, 10 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 62, 11 },
+    { 15, 10 },
+    { 122, 8 },
+    { 23, 10 },
+    { 39, 10 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 27, 10 },
+    { 43, 10 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 51, 10 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 29, 10 },
+    { 45, 10 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 53, 10 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 57, 10 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 146, 4 },
+    { 209, 12 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 160, 9 },
+    { 172, 9 },
+    { 147, 5 },
+    { 184, 9 },
+    { 150, 5 },
+    { 162, 5 },
+    { 65, 5 },
+    { 196, 9 },
+    { 153, 5 },
+    { 165, 5 },
+    { 67, 5 },
+    { 177, 5 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 175, 9 },
+    { 148, 6 },
+    { 142, 10 },
+    { 81, 9 },
+    { 99, 9 },
+    { 66, 6 },
+    { 199, 9 },
+    { 87, 9 },
+    { 105, 9 },
+    { 68, 6 },
+    { 123, 9 },
+    { 74, 6 },
+    { 92, 6 },
+    { 64, 4 },
+    { 209, 12 },
+    { 157, 6 },
+    { 111, 9 },
+    { 70, 6 },
+    { 129, 9 },
+    { 76, 6 },
+    { 94, 6 },
+    { 65, 5 },
+    { 193, 6 },
+    { 82, 6 },
+    { 100, 6 },
+    { 67, 5 },
+    { 118, 6 },
+    { 73, 5 },
+    { 91, 5 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 190, 9 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 202, 9 },
+    { 89, 9 },
+    { 107, 9 },
+    { 69, 7 },
+    { 125, 9 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 113, 9 },
+    { 71, 7 },
+    { 131, 9 },
+    { 30, 10 },
+    { 46, 10 },
+    { 7, 9 },
+    { 194, 7 },
+    { 83, 7 },
+    { 54, 10 },
+    { 11, 9 },
+    { 119, 7 },
+    { 19, 9 },
+    { 35, 9 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 137, 9 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 58, 10 },
+    { 13, 9 },
+    { 121, 7 },
+    { 21, 9 },
+    { 37, 9 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 25, 9 },
+    { 41, 9 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 49, 9 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 145, 3 },
+    { 205, 9 },
+    { 156, 8 },
+    { 168, 8 },
+    { 146, 4 },
+    { 180, 8 },
+    { 149, 4 },
+    { 161, 4 },
+    { 64, 4 },
+    { 209, 12 },
+    { 159, 8 },
+    { 115, 9 },
+    { 72, 8 },
+    { 133, 9 },
+    { 78, 8 },
+    { 96, 8 },
+    { 65, 5 },
+    { 195, 8 },
+    { 84, 8 },
+    { 102, 8 },
+    { 67, 5 },
+    { 120, 8 },
+    { 73, 5 },
+    { 91, 5 },
+    { 64, 4 },
+    { 209, 12 },
+    { 209, 12 },
+    { 174, 8 },
+    { 148, 6 },
+    { 139, 9 },
+    { 80, 8 },
+    { 98, 8 },
+    { 66, 6 },
+    { 198, 8 },
+    { 86, 8 },
+    { 60, 10 },
+    { 14, 9 },
+    { 122, 8 },
+    { 22, 9 },
+    { 38, 9 },
+    { 3, 8 },
+    { 209, 12 },
+    { 157, 6 },
+    { 110, 8 },
+    { 70, 6 },
+    { 128, 8 },
+    { 26, 9 },
+    { 42, 9 },
+    { 5, 8 },
+    { 193, 6 },
+    { 82, 6 },
+    { 50, 9 },
+    { 9, 8 },
+    { 118, 6 },
+    { 17, 8 },
+    { 33, 8 },
+    { 0, 6 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 209, 12 },
+    { 189, 8 },
+    { 152, 7 },
+    { 164, 7 },
+    { 145, 3 },
+    { 201, 8 },
+    { 88, 8 },
+    { 106, 8 },
+    { 69, 7 },
+    { 124, 8 },
+    { 75, 7 },
+    { 93, 7 },
+    { 64, 4 },
+    { 209, 12 },
+    { 158, 7 },
+    { 112, 8 },
+    { 71, 7 },
+    { 130, 8 },
+    { 28, 9 },
+    { 44, 9 },
+    { 6, 8 },
+    { 194, 7 },
+    { 83, 7 },
+    { 52, 9 },
+    { 10, 8 },
+    { 119, 7 },
+    { 18, 8 },
+    { 34, 8 },
+    { 1, 7 },
+    { 209, 12 },
+    { 209, 12 },
+    { 173, 7 },
+    { 148, 6 },
+    { 136, 8 },
+    { 79, 7 },
+    { 97, 7 },
+    { 66, 6 },
+    { 197, 7 },
+    { 85, 7 },
+    { 56, 9 },
+    { 12, 8 },
+    { 121, 7 },
+    { 20, 8 },
+    { 36, 8 },
+    { 2, 7 },
+    { 209, 12 },
+    { 157, 6 },
+    { 109, 7 },
+    { 70, 6 },
+    { 127, 7 },
+    { 24, 8 },
+    { 40, 8 },
+    { 4, 7 },
+    { 193, 6 },
+    { 82, 6 },
+    { 48, 8 },
+    { 8, 7 },
+    { 118, 6 },
+    { 16, 7 },
+    { 32, 7 },
+    { 0, 6 } };
 } // utf8_to_utf16 namespace
 } // tables namespace
 } // unnamed namespace
@@ -9397,7 +10689,7 @@ const uint8_t utf8bigindex[4096][2] =
 
 #endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
 /* end file src/tables/utf8_to_utf16_tables.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=tables/utf16_to_utf8_tables.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=tables/utf16_to_utf8_tables.h
 /* begin file src/tables/utf16_to_utf8_tables.h */
 // file generated by scripts/sse_convert_utf16_to_utf8.py
 #ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
@@ -9408,525 +10700,525 @@ namespace {
 namespace tables {
 namespace utf16_to_utf8 {
 
-  // 1 byte for length, 16 bytes for mask
-  const uint8_t pack_1_2_utf8_bytes[256][17] = {
-    {16,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
-    {15,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
-    {15,1,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80},
-    {14,0,3,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
-    {15,1,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80},
-    {14,0,2,5,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {14,1,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,2,5,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80},
-    {14,0,3,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
-    {13,0,2,5,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80},
-    {14,0,3,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {14,1,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,2,4,7,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80},
-    {14,0,3,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
-    {13,0,2,5,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80},
-    {13,0,3,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80},
-    {14,0,3,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {14,1,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,2,5,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80},
-    {13,0,3,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,2,4,6,9,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,8,11,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,9,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,8,10,13,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80},
-    {13,0,3,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,9,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,8,11,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,9,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,8,10,12,15,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {15,1,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80},
-    {14,0,3,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {14,1,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
-    {13,0,2,5,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80},
-    {13,0,3,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,4,7,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80},
-    {13,0,3,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,7,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,7,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,7,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {14,1,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80},
-    {13,0,3,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {13,1,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,2,5,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80},
-    {12,0,3,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,4,6,9,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,8,11,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,9,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,8,10,13,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {13,1,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80},
-    {12,0,3,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,2,5,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,5,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,5,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {12,1,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80},
-    {11,0,3,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,2,4,6,9,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,8,11,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,1,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80},
-    {10,0,3,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,3,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,1,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,2,4,6,9,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,1,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,2,4,6,8,10,12,14,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
-  };
-
-  // 1 byte for length, 16 bytes for mask
-  const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
-    {12,2,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80},
-    {9,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,3,1,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
-    {10,0,6,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,2,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
-    {8,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,3,1,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,7,5,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,4,10,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,2,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80},
-    {8,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,3,1,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,6,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,7,5,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,4,11,9,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,7,5,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,4,8,14,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,10,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,6,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,3,1,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,7,5,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,2,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,3,1,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,0,4,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,6,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,7,5,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,4,11,9,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,6,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,2,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,3,1,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,0,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,7,5,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,4,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {11,2,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80},
-    {8,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,3,1,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,0,6,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,7,5,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,4,10,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,6,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,7,5,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,4,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,7,5,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,4,11,9,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,8,15,13,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {10,2,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,3,1,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,0,6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,7,5,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,4,10,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,6,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,2,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,3,1,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,0,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,7,5,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,4,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {9,2,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,3,1,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,0,6,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,7,5,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,4,11,9,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {8,2,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,3,1,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,0,6,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,2,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {2,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,3,1,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,0,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {7,2,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,3,1,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,0,7,5,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {6,2,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {3,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {5,3,1,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80},
-    {4,0,4,8,12,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}
-  };
+// 1 byte for length, 16 bytes for mask
+const uint8_t pack_1_2_utf8_bytes[256][17] = {
+    { 16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
+    { 15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 15, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 8, 11, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 9, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 8, 10, 13, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 9, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 8, 11, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 9, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80 },
+    { 14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80 },
+    { 13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80 },
+    { 12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }
+};
+
+// Each row: 1 length byte (number of indices used), then a 16-byte mask of byte indices padded with 0x80.
+const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
+    { 12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 },
+    { 4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }
+};
 
 } // utf16_to_utf8 namespace
 } // tables namespace
@@ -9938,7 +11230,7 @@ namespace utf16_to_utf8 {
 // End of tables.
 
 // The scalar routines should be included once.
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/ascii.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/ascii.h
 /* begin file src/scalar/ascii.h */
 #ifndef SIMDUTF_ASCII_H
 #define SIMDUTF_ASCII_H
@@ -9949,45 +11241,55 @@ namespace {
 namespace ascii {
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 // Only used by the fallback kernel.
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-    const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+inline simdutf_warn_unused bool validate(const char* buf, size_t len) noexcept // Returns true iff none of the len bytes at buf has its high bit set (i.e. all are ASCII).
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf); // view the input as unsigned bytes
     uint64_t pos = 0;
     // process in blocks of 16 bytes when possible
-    for (;pos + 16 < len; pos += 16) {
+    for (; pos + 16 < len; pos += 16) { // strict '<': up to 16 trailing bytes are left to the tail loop
         uint64_t v1;
         std::memcpy(&v1, data + pos, sizeof(uint64_t));
         uint64_t v2;
         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-        uint64_t v{v1 | v2};
-        if ((v & 0x8080808080808080) != 0) { return false; }
+        uint64_t v { v1 | v2 }; // OR both 8-byte words so a single mask test covers all 16 bytes
+        if ((v & 0x8080808080808080) != 0) { // a set 0x80 bit in any byte lane means a non-ASCII byte
+            return false;
+        }
     }
     // process the tail byte-by-byte
-    for (;pos < len; pos ++) {
-        if (data[pos] >= 0b10000000) { return false; }
+    for (; pos < len; pos++) {
+        if (data[pos] >= 0b10000000) { // same high-bit test, one byte at a time
+            return false;
+        }
     }
     return true;
 }
 #endif
 
-inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
-    const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+inline simdutf_warn_unused result validate_with_errors(const char* buf, size_t len) noexcept // ASCII check: yields result(TOO_LARGE, index of first byte >= 0x80), else result(SUCCESS, len).
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf); // view the input as unsigned bytes
     size_t pos = 0;
     // process in blocks of 16 bytes when possible
-    for (;pos + 16 < len; pos += 16) {
+    for (; pos + 16 < len; pos += 16) { // strict '<': up to 16 trailing bytes are left to the tail loop
         uint64_t v1;
         std::memcpy(&v1, data + pos, sizeof(uint64_t));
         uint64_t v2;
         std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-        uint64_t v{v1 | v2};
+        uint64_t v { v1 | v2 }; // OR both 8-byte words so a single mask test covers all 16 bytes
         if ((v & 0x8080808080808080) != 0) {
-            for (;pos < len; pos ++) {
-                if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
+            for (; pos < len; pos++) { // block contains a high-bit byte: rescan bytewise for its exact index
+                if (data[pos] >= 0b10000000) {
+                    return result(error_code::TOO_LARGE, pos); // pos is the index of the first byte >= 0x80
+                }
             }
         }
     }
     // process the tail byte-by-byte
-    for (;pos < len; pos ++) {
-        if (data[pos] >= 0b10000000) { return result(error_code::TOO_LARGE, pos); }
+    for (; pos < len; pos++) {
+        if (data[pos] >= 0b10000000) { // same high-bit test for the bytes the block loop did not cover
+            return result(error_code::TOO_LARGE, pos);
+        }
     }
     return result(error_code::SUCCESS, pos);
 }
@@ -9999,7 +11301,7 @@ inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t l
 
 #endif
 /* end file src/scalar/ascii.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8.h
 /* begin file src/scalar/utf8.h */
 #ifndef SIMDUTF_UTF8_H
 #define SIMDUTF_UTF8_H
@@ -10011,177 +11313,249 @@ namespace utf8 {
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
 // only used by the fallback kernel.
 // credit: based on code from Google Fuchsia (Apache Licensed)
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  uint64_t pos = 0;
-  uint32_t code_point = 0;
-  while (pos < len) {
-    // check of the next 8 bytes are ascii.
-    uint64_t next_pos = pos + 16;
-    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v1;
-      std::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
+inline simdutf_warn_unused bool validate(const char* buf, size_t len) noexcept
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    uint64_t pos = 0;
+    uint32_t code_point = 0;
+    while (pos < len) {
+        // check if the next 16 bytes are ascii.
+        uint64_t next_pos = pos + 16;
+        if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            std::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 };
+            if ((v & 0x8080808080808080) == 0) {
+                pos = next_pos;
+                continue;
+            }
+        }
+        unsigned char byte = data[pos];
+
+        while (byte < 0b10000000) {
+            if (++pos == len) {
+                return true;
+            }
+            byte = data[pos];
+        }
+
+        if ((byte & 0b11100000) == 0b11000000) {
+            next_pos = pos + 2;
+            if (next_pos > len) {
+                return false;
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            // range check
+            code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if ((code_point < 0x80) || (0x7ff < code_point)) {
+                return false;
+            }
+        } else if ((byte & 0b11110000) == 0b11100000) {
+            next_pos = pos + 3;
+            if (next_pos > len) {
+                return false;
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            // range check
+            code_point = (byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if ((code_point < 0x800) || (0xffff < code_point) || (0xd7ff < code_point && code_point < 0xe000)) {
+                return false;
+            }
+        } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+            next_pos = pos + 4;
+            if (next_pos > len) {
+                return false;
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return false;
+            }
+            // range check
+            code_point = (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff || 0x10ffff < code_point) {
+                return false;
+            }
+        } else {
+            // we may have a continuation
+            return false;
+        }
         pos = next_pos;
-        continue;
-      }
-    }
-    unsigned char byte = data[pos];
-
-    while (byte < 0b10000000) {
-      if (++pos == len) { return true; }
-      byte = data[pos];
-    }
-
-    if ((byte & 0b11100000) == 0b11000000) {
-      next_pos = pos + 2;
-      if (next_pos > len) { return false; }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
-      // range check
-      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if ((code_point < 0x80) || (0x7ff < code_point)) { return false; }
-    } else if ((byte & 0b11110000) == 0b11100000) {
-      next_pos = pos + 3;
-      if (next_pos > len) { return false; }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
-      // range check
-      code_point = (byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point) ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return false;
-      }
-    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-      next_pos = pos + 4;
-      if (next_pos > len) { return false; }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return false; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return false; }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return false; }
-      // range check
-      code_point =
-          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) { return false; }
-    } else {
-      // we may have a continuation
-      return false;
     }
-    pos = next_pos;
-  }
-  return true;
+    return true;
 }
 #endif
 
-inline simdutf_warn_unused result validate_with_errors(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  uint32_t code_point = 0;
-  while (pos < len) {
-    // check of the next 8 bytes are ascii.
-    size_t next_pos = pos + 16;
-    if (next_pos <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v1;
-      std::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
+inline simdutf_warn_unused result validate_with_errors(const char* buf, size_t len) noexcept
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    uint32_t code_point = 0;
+    while (pos < len) {
+        // check if the next 16 bytes are ascii.
+        size_t next_pos = pos + 16;
+        if (next_pos <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            std::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 };
+            if ((v & 0x8080808080808080) == 0) {
+                pos = next_pos;
+                continue;
+            }
+        }
+        unsigned char byte = data[pos];
+
+        while (byte < 0b10000000) {
+            if (++pos == len) {
+                return result(error_code::SUCCESS, len);
+            }
+            byte = data[pos];
+        }
+
+        if ((byte & 0b11100000) == 0b11000000) {
+            next_pos = pos + 2;
+            if (next_pos > len) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if ((code_point < 0x80) || (0x7ff < code_point)) {
+                return result(error_code::OVERLONG, pos);
+            }
+        } else if ((byte & 0b11110000) == 0b11100000) {
+            next_pos = pos + 3;
+            if (next_pos > len) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            code_point = (byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if ((code_point < 0x800) || (0xffff < code_point)) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xd7ff < code_point && code_point < 0xe000) {
+                return result(error_code::SURROGATE, pos);
+            }
+        } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
+            next_pos = pos + 4;
+            if (next_pos > len) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            code_point = (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0x10ffff < code_point) {
+                return result(error_code::TOO_LARGE, pos);
+            }
+        } else {
+            // we either have too many continuation bytes or an invalid leading byte
+            if ((byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            } else {
+                return result(error_code::HEADER_BITS, pos);
+            }
+        }
         pos = next_pos;
-        continue;
-      }
-    }
-    unsigned char byte = data[pos];
-
-    while (byte < 0b10000000) {
-      if (++pos == len) { return result(error_code::SUCCESS, len); }
-      byte = data[pos];
-    }
-
-    if ((byte & 0b11100000) == 0b11000000) {
-      next_pos = pos + 2;
-      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if ((code_point < 0x80) || (0x7ff < code_point)) { return result(error_code::OVERLONG, pos); }
-    } else if ((byte & 0b11110000) == 0b11100000) {
-      next_pos = pos + 3;
-      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      code_point = (byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
-      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
-    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-      next_pos = pos + 4;
-      if (next_pos > len) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      code_point =
-          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
-      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
-      else { return result(error_code::HEADER_BITS, pos); }
     }
-    pos = next_pos;
-  }
-  return result(error_code::SUCCESS, len);
+    return result(error_code::SUCCESS, len);
 }
 
 // Finds the previous leading byte and validates with errors from there
 // Used to pinpoint the location of an error when an invalid chunk is detected
-inline simdutf_warn_unused result rewind_and_validate_with_errors(const char *buf, size_t len) noexcept {
-  size_t extra_len{0};
-  // A leading byte cannot be further than 4 bytes away
-  for(int i = 0; i < 5; i++) {
-    unsigned char byte = *buf;
-    if ((byte & 0b11000000) != 0b10000000) {
-      break;
-    } else {
-      buf--;
-      extra_len++;
+inline simdutf_warn_unused result rewind_and_validate_with_errors(const char* buf, size_t len) noexcept
+{
+    size_t extra_len { 0 };
+    // A leading byte cannot be further than 4 bytes away
+    for (int i = 0; i < 5; i++) {
+        unsigned char byte = *buf;
+        if ((byte & 0b11000000) != 0b10000000) {
+            break;
+        } else {
+            buf--;
+            extra_len++;
+        }
     }
-  }
 
-  result res = validate_with_errors(buf, len + extra_len);
-  res.count -= extra_len;
-  return res;
+    result res = validate_with_errors(buf, len + extra_len);
+    res.count -= extra_len;
+    return res;
 }
 
-inline size_t count_code_points(const char* buf, size_t len) {
-    const int8_t * p = reinterpret_cast<const int8_t *>(buf);
-    size_t counter{0};
-    for(size_t i = 0; i < len; i++) {
+inline size_t count_code_points(const char* buf, size_t len)
+{
+    const int8_t* p = reinterpret_cast<const int8_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
         // -65 is 0b10111111, anything larger in two-complement's should start a new code point.
-        if(p[i] > -65) { counter++; }
+        if (p[i] > -65) {
+            counter++;
+        }
     }
     return counter;
 }
 
-inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
-    const int8_t * p = reinterpret_cast<const int8_t *>(buf);
-    size_t counter{0};
-    for(size_t i = 0; i < len; i++) {
-        if(p[i] > -65) { counter++; }
-        if(uint8_t(p[i]) >= 240) { counter++; }
+inline size_t utf16_length_from_utf8(const char* buf, size_t len)
+{
+    const int8_t* p = reinterpret_cast<const int8_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        if (p[i] > -65) {
+            counter++;
+        }
+        if (uint8_t(p[i]) >= 240) {
+            counter++;
+        }
     }
     return counter;
 }
 
+inline size_t latin1_length_from_utf8(const char* buf, size_t len)
+{
+    const uint8_t* c = reinterpret_cast<const uint8_t*>(buf);
+
+    size_t answer = len;
+    for (size_t i = 0; i < len; i++) {
+        if ((c[i] & 0b11100000) == 0b11000000) {
+            answer--;
+        } // if we have a two-byte UTF8 character
+    }
+    return answer;
+}
+
 } // utf8 namespace
 } // unnamed namespace
 } // namespace scalar
@@ -10189,7 +11563,7 @@ inline size_t utf16_length_from_utf8(const char* buf, size_t len) {
 
 #endif
 /* end file src/scalar/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16.h
 /* begin file src/scalar/utf16.h */
 #ifndef SIMDUTF_UTF16_H
 #define SIMDUTF_UTF16_H
@@ -10199,101 +11573,133 @@ namespace scalar {
 namespace {
 namespace utf16 {
 
-inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
-  return uint16_t((word >> 8) | (word << 8));
-}
-
-template <endianness big_endian>
-inline simdutf_warn_unused bool validate(const char16_t *buf, size_t len) noexcept {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  uint64_t pos = 0;
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800) == 0xD800) {
-        if(pos + 1 >= len) { return false; }
-        uint16_t diff = uint16_t(word - 0xD800);
-        if(diff > 0x3FF) { return false; }
-        uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-        uint16_t diff2 = uint16_t(next_word - 0xDC00);
-        if(diff2 > 0x3FF) { return false; }
-        pos += 2;
-    } else {
-        pos++;
+inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word)
+{
+    return uint16_t((word >> 8) | (word << 8));
+}
+
+template<endianness big_endian>
+inline simdutf_warn_unused bool validate(const char16_t* buf, size_t len) noexcept
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    uint64_t pos = 0;
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) == 0xD800) {
+            if (pos + 1 >= len) {
+                return false;
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return false;
+            }
+            uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return false;
+            }
+            pos += 2;
+        } else {
+            pos++;
+        }
     }
-  }
-  return true;
-}
-
-template <endianness big_endian>
-inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, size_t len) noexcept {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-    if((word & 0xF800) == 0xD800) {
-        if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
-        uint16_t diff = uint16_t(word - 0xD800);
-        if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
-        uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-        uint16_t diff2 = uint16_t(next_word - 0xDC00);
-        if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
-        pos += 2;
-    } else {
-        pos++;
+    return true;
+}
+
+template<endianness big_endian>
+inline simdutf_warn_unused result validate_with_errors(const char16_t* buf, size_t len) noexcept
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) == 0xD800) {
+            if (pos + 1 >= len) {
+                return result(error_code::SURROGATE, pos);
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) {
+                return result(error_code::SURROGATE, pos);
+            }
+            uint16_t next_word = !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) {
+                return result(error_code::SURROGATE, pos);
+            }
+            pos += 2;
+        } else {
+            pos++;
+        }
+    }
+    return result(error_code::SUCCESS, pos);
+}
+
+template<endianness big_endian>
+inline size_t count_code_points(const char16_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+        counter += ((word & 0xFC00) != 0xDC00);
+    }
+    return counter;
+}
+
+template<endianness big_endian>
+inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+        /** ASCII **/
+        if (word <= 0x7F) {
+            counter++;
+        }
+        /** two-byte **/
+        else if (word <= 0x7FF) {
+            counter += 2;
+        }
+        /** three-byte **/
+        else if ((word <= 0xD7FF) || (word >= 0xE000)) {
+            counter += 3;
+        }
+        /** surrogate code unit -- 2 bytes each, i.e. 4 bytes per surrogate pair **/
+        else {
+            counter += 2;
+        }
+    }
+    return counter;
+}
+
+template<endianness big_endian>
+inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint16_t* p = reinterpret_cast<const uint16_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+        counter += ((word & 0xFC00) != 0xDC00);
+    }
+    return counter;
+}
+
+inline size_t latin1_length_from_utf16(size_t len)
+{
+    return len;
+}
+
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out)
+{
+    const uint16_t* input = reinterpret_cast<const uint16_t*>(in);
+    uint16_t* output = reinterpret_cast<uint16_t*>(out);
+    for (size_t i = 0; i < size; i++) {
+        *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
     }
-  }
-  return result(error_code::SUCCESS, pos);
-}
-
-template <endianness big_endian>
-inline size_t count_code_points(const char16_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter += ((word & 0xFC00) != 0xDC00);
-  }
-  return counter;
-}
-
-template <endianness big_endian>
-inline size_t utf8_length_from_utf16(const char16_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    /** ASCII **/
-    if(word <= 0x7F) { counter++; }
-    /** two-byte **/
-    else if (word <= 0x7FF) { counter += 2; }
-    /** three-byte **/
-    else if((word <= 0xD7FF) || (word >= 0xE000)) { counter += 3; }
-    /** surrogates -- 4 bytes **/
-    else { counter += 2; }
-  }
-  return counter;
-}
-
-template <endianness big_endian>
-inline size_t utf32_length_from_utf16(const char16_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t * p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter += ((word & 0xFC00) != 0xDC00);
-  }
-  return counter;
-}
-
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* out) {
-  const uint16_t * input = reinterpret_cast<const uint16_t *>(in);
-  uint16_t * output = reinterpret_cast<uint16_t *>(out);
-  for (size_t i = 0; i < size; i++) {
-    *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
-  }
 }
 
 } // utf16 namespace
@@ -10303,7 +11709,7 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 
 #endif
 /* end file src/scalar/utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32.h
 /* begin file src/scalar/utf32.h */
 #ifndef SIMDUTF_UTF32_H
 #define SIMDUTF_UTF32_H
@@ -10313,61 +11719,83 @@ namespace scalar {
 namespace {
 namespace utf32 {
 
-inline simdutf_warn_unused bool validate(const char32_t *buf, size_t len) noexcept {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  uint64_t pos = 0;
-  for(;pos < len; pos++) {
-    uint32_t word = data[pos];
-    if(word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
-        return false;
+inline simdutf_warn_unused bool validate(const char32_t* buf, size_t len) noexcept
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    uint64_t pos = 0;
+    for (; pos < len; pos++) {
+        uint32_t word = data[pos];
+        if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+inline simdutf_warn_unused result validate_with_errors(const char32_t* buf, size_t len) noexcept
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    for (; pos < len; pos++) {
+        uint32_t word = data[pos];
+        if (word > 0x10FFFF) {
+            return result(error_code::TOO_LARGE, pos);
+        }
+        if (word >= 0xD800 && word <= 0xDFFF) {
+            return result(error_code::SURROGATE, pos);
+        }
     }
-  }
-  return true;
-}
-
-inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, size_t len) noexcept {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  for(;pos < len; pos++) {
-    uint32_t word = data[pos];
-    if(word > 0x10FFFF) {
-        return result(error_code::TOO_LARGE, pos);
-    }
-    if(word >= 0xD800 && word <= 0xDFFF) {
-        return result(error_code::SURROGATE, pos);
-    }
-  }
-  return result(error_code::SUCCESS, pos);
-}
-
-inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint32_t * p = reinterpret_cast<const uint32_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    /** ASCII **/
-    if(p[i] <= 0x7F) { counter++; }
-    /** two-byte **/
-    else if(p[i] <= 0x7FF) { counter += 2; }
-    /** three-byte **/
-    else if(p[i] <= 0xFFFF) { counter += 3; }
-    /** four-bytes **/
-    else { counter += 4; }
-  }
-  return counter;
-}
-
-inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
-  // We are not BOM aware.
-  const uint32_t * p = reinterpret_cast<const uint32_t *>(buf);
-  size_t counter{0};
-  for(size_t i = 0; i < len; i++) {
-    /** non-surrogate word **/
-    if(p[i] <= 0xFFFF) { counter++; }
-    /** surrogate pair **/
-    else { counter += 2; }
-  }
-  return counter;
+    return result(error_code::SUCCESS, pos);
+}
+
+inline size_t utf8_length_from_utf32(const char32_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        /** ASCII **/
+        if (p[i] <= 0x7F) {
+            counter++;
+        }
+        /** two-byte **/
+        else if (p[i] <= 0x7FF) {
+            counter += 2;
+        }
+        /** three-byte **/
+        else if (p[i] <= 0xFFFF) {
+            counter += 3;
+        }
+        /** four-bytes **/
+        else {
+            counter += 4;
+        }
+    }
+    return counter;
+}
+
+inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len)
+{
+    // We are not BOM aware.
+    const uint32_t* p = reinterpret_cast<const uint32_t*>(buf);
+    size_t counter { 0 };
+    for (size_t i = 0; i < len; i++) {
+        /** non-surrogate word **/
+        if (p[i] <= 0xFFFF) {
+            counter++;
+        }
+        /** surrogate pair **/
+        else {
+            counter += 2;
+        }
+    }
+    return counter;
+}
+
+inline size_t latin1_length_from_utf32(size_t len)
+{
+    // We are not BOM aware.
+    return len; // a utf32 codepoint will always represent 1 latin1 character
 }
 
 } // utf32 namespace
@@ -10377,8 +11805,48 @@ inline size_t utf16_length_from_utf32(const char32_t* buf, size_t len) {
 
 #endif
 /* end file src/scalar/utf32.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1.h
+/* begin file src/scalar/latin1.h */
+#ifndef SIMDUTF_LATIN1_H
+#define SIMDUTF_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1 {
+
+inline size_t utf32_length_from_latin1(size_t len)
+{
+    // We are not BOM aware.
+    return len; // a utf32 unit will always represent 1 latin1 character
+}
+
+inline size_t utf8_length_from_latin1(const char* buf, size_t len)
+{
+    const uint8_t* c = reinterpret_cast<const uint8_t*>(buf);
+    size_t answer = 0;
+    for (size_t i = 0; i < len; i++) {
+        if ((c[i] >> 7)) {
+            answer++;
+        }
+    }
+    return answer + len;
+}
+
+inline size_t utf16_length_from_latin1(size_t len)
+{
+    return len;
+}
+
+} // latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h
+#endif
+/* end file src/scalar/latin1.h */
+
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf8/valid_utf32_to_utf8.h
 /* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
 #ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
 #define SIMDUTF_VALID_UTF32_TO_UTF8_H
@@ -10390,51 +11858,52 @@ namespace utf32_to_utf8 {
 
 #if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 // only used by the fallback and POWER kernel
-inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output) {
-	const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
+inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 2 ASCII characters
+        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0xFFFFFF80FFFFFF80) == 0) {
+                *utf8_output++ = char(buf[pos]);
+                *utf8_output++ = char(buf[pos + 1]);
+                pos += 2;
+                continue;
+            }
+        }
+        uint32_t word = data[pos];
+        if ((word & 0xFFFFFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xFFFFF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xFFFF0000) == 0) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 18) | 0b11110000);
+            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        }
     }
-  }
-  return utf8_output - start;
+    return utf8_output - start;
 }
 #endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 
@@ -10445,7 +11914,7 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char* utf8_output)
 
 #endif
 /* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf8/utf32_to_utf8.h
 /* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
 #ifndef SIMDUTF_UTF32_TO_UTF8_H
 #define SIMDUTF_UTF32_TO_UTF8_H
@@ -10455,102 +11924,112 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf8 {
 
-inline size_t convert(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-			if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-			if (word > 0x10FFFF) { return 0; }
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
-    }
-  }
-  return utf8_output - start;
-}
-
-inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-				*utf8_output++ = char(buf[pos+1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if((word & 0xFFFFFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xFFFFF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word & 0xFFFF0000)==0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-			if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-			if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
-      *utf8_output++ = char((word>>18) | 0b11110000);
-      *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos ++;
+inline size_t convert(const char32_t* buf, size_t len, char* utf8_output)
+{ // Transcodes len UTF-32 words from buf to UTF-8 at utf8_output; returns the number of bytes written, or 0 on the first invalid word.
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 2 ASCII characters
+        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t)); // load two 32-bit words at once
+            if ((v & 0xFFFFFF80FFFFFF80) == 0) { // both words are below 0x80
+                *utf8_output++ = char(buf[pos]);
+                *utf8_output++ = char(buf[pos + 1]);
+                pos += 2;
+                continue;
+            }
+        }
+        uint32_t word = data[pos];
+        if ((word & 0xFFFFFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xFFFFF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xFFFF0000) == 0) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            if (word >= 0xD800 && word <= 0xDFFF) { // values in the surrogate range are rejected
+                return 0; // bytes already written to utf8_output are left in place
+            }
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            if (word > 0x10FFFF) { // values above 0x10FFFF are rejected
+                return 0;
+            }
+            *utf8_output++ = char((word >> 18) | 0b11110000);
+            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        }
+    }
+    return utf8_output - start; // also 0 when len == 0, so 0 alone does not distinguish empty input from an error
+}
+
+inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{ // Same transcoding as convert(), but reports failures as result(error_code, index of the offending word) instead of 0.
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 2 ASCII characters
+        if (pos + 2 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t)); // load two 32-bit words at once
+            if ((v & 0xFFFFFF80FFFFFF80) == 0) { // both words are below 0x80
+                *utf8_output++ = char(buf[pos]);
+                *utf8_output++ = char(buf[pos + 1]);
+                pos += 2;
+                continue;
+            }
+        }
+        uint32_t word = data[pos];
+        if ((word & 0xFFFFFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xFFFFF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xFFFF0000) == 0) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            if (word >= 0xD800 && word <= 0xDFFF) { // values in the surrogate range are rejected
+                return result(error_code::SURROGATE, pos); // pos is an index into buf, not a byte count
+            }
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            if (word > 0x10FFFF) { // values above 0x10FFFF are rejected
+                return result(error_code::TOO_LARGE, pos);
+            }
+            *utf8_output++ = char((word >> 18) | 0b11110000);
+            *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf8_output - start);
+    return result(error_code::SUCCESS, utf8_output - start); // on success the second field is the number of bytes written
 }
 
 } // utf32_to_utf8 namespace
@@ -10561,7 +12040,7 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char* utf8_ou
 #endif
 /* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf16/valid_utf32_to_utf16.h
 /* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
 #ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
 #define SIMDUTF_VALID_UTF32_TO_UTF16_H
@@ -10571,32 +12050,33 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-      pos++;
-    } else {
-      // will generate a surrogate pair
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos++;
-    }
-  }
-  return utf16_output - start;
+template<endianness big_endian>
+inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // Transcodes len UTF-32 words to UTF-16 with no range checks; returns the number of char16_t units written.
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        uint32_t word = data[pos];
+        if ((word & 0xFFFF0000) == 0) {
+            // will not generate a surrogate pair
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); // byte-swapped when match_system(big_endian) is false
+            pos++;
+        } else {
+            // will generate a surrogate pair
+            word -= 0x10000;
+            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // bits above the low 10
+            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // low 10 bits
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos++;
+        }
+    }
+    return utf16_output - start;
 }
 
 } // utf32_to_utf16 namespace
@@ -10606,7 +12086,7 @@ inline size_t convert_valid(const char32_t* buf, size_t len, char16_t* utf16_out
 
 #endif
 /* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_utf16/utf32_to_utf16.h
 /* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
 #ifndef SIMDUTF_UTF32_TO_UTF16_H
 #define SIMDUTF_UTF32_TO_UTF16_H
@@ -10616,62 +12096,72 @@ namespace scalar {
 namespace {
 namespace utf32_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      if (word >= 0xD800 && word <= 0xDFFF) { return 0; }
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-    } else {
-      // will generate a surrogate pair
-      if (word > 0x10FFFF) { return 0; }
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-    }
-    pos++;
-  }
-  return utf16_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if((word & 0xFFFF0000)==0) {
-      if (word >= 0xD800 && word <= 0xDFFF) { return result(error_code::SURROGATE, pos); }
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word);
-    } else {
-      // will generate a surrogate pair
-      if (word > 0x10FFFF) { return result(error_code::TOO_LARGE, pos); }
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-    }
-    pos++;
-  }
-  return result(error_code::SUCCESS, utf16_output - start);
+template<endianness big_endian>
+inline size_t convert(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // Transcodes len UTF-32 words to UTF-16; returns the number of char16_t units written, or 0 on the first invalid word.
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        uint32_t word = data[pos];
+        if ((word & 0xFFFF0000) == 0) {
+            if (word >= 0xD800 && word <= 0xDFFF) { // values in the surrogate range are rejected
+                return 0; // units already written to utf16_output are left in place
+            }
+            // will not generate a surrogate pair
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); // byte-swapped when match_system(big_endian) is false
+        } else {
+            // will generate a surrogate pair
+            if (word > 0x10FFFF) { // values above 0x10FFFF are rejected
+                return 0;
+            }
+            word -= 0x10000;
+            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // upper 10 of the remaining 20 bits
+            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // low 10 bits
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+        }
+        pos++;
+    }
+    return utf16_output - start; // also 0 when len == 0
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // Same transcoding as convert(), but reports failures as result(error_code, index of the offending word) instead of 0.
+    const uint32_t* data = reinterpret_cast<const uint32_t*>(buf);
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    while (pos < len) {
+        uint32_t word = data[pos];
+        if ((word & 0xFFFF0000) == 0) {
+            if (word >= 0xD800 && word <= 0xDFFF) { // values in the surrogate range are rejected
+                return result(error_code::SURROGATE, pos); // pos is an index into buf
+            }
+            // will not generate a surrogate pair
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(uint16_t(word))) : char16_t(word); // byte-swapped when match_system(big_endian) is false
+        } else {
+            // will generate a surrogate pair
+            if (word > 0x10FFFF) { // values above 0x10FFFF are rejected
+                return result(error_code::TOO_LARGE, pos);
+            }
+            word -= 0x10000;
+            uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // upper 10 of the remaining 20 bits
+            uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // low 10 bits
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+        }
+        pos++;
+    }
+    return result(error_code::SUCCESS, utf16_output - start); // on success the second field is the number of char16_t units written
 }
 
 } // utf32_to_utf16 namespace
@@ -10682,7 +12172,7 @@ inline result convert_with_errors(const char32_t* buf, size_t len, char16_t* utf
 #endif
 /* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf8/valid_utf16_to_utf8.h
 /* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
 #ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
 #define SIMDUTF_VALID_UTF16_TO_UTF8_H
@@ -10692,62 +12182,67 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf8 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 4 ASCII characters
-    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while(pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word & 0xFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word &0xF800 ) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value>>18) | 0b11110000);
-      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return utf8_output - start;
+template<endianness big_endian>
+inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output)
+{ // Transcodes len UTF-16 units to UTF-8 with only a bounds check on surrogate pairs; returns the number of bytes written.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 4 ASCII characters
+        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t)); // load four 16-bit units at once
+            if (!match_system(big_endian)) {
+                v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits before applying the ASCII mask
+            }
+            if ((v & 0xFF80FF80FF80FF80) == 0) {
+                size_t final_pos = pos + 4;
+                while (pos < final_pos) {
+                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xF800) != 0xD800) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // must be a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00); // neither diff nor diff2 is range-checked in this variant
+            uint32_t value = (diff << 10) + diff2 + 0x10000;
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((value >> 18) | 0b11110000);
+            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            pos += 2;
+        }
+    }
+    return utf8_output - start;
 }
 
 } // utf16_to_utf8 namespace
@@ -10757,7 +12252,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char* utf8_output)
 
 #endif
 /* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf8/utf16_to_utf8.h
 /* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
 #ifndef SIMDUTF_UTF16_TO_UTF8_H
 #define SIMDUTF_UTF16_TO_UTF8_H
@@ -10767,122 +12262,139 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf8 {
 
-template <endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char* utf8_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII characters
-    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while(pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word & 0xFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word &0xF800 ) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      if(pos + 1 >= len) { return 0; }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return 0; }
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return 0; }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value>>18) | 0b11110000);
-      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return utf8_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char* start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII characters
-    if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while(pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word & 0xFF80)==0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if((word & 0xF800)==0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if((word &0xF800 ) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word>>12) | 0b11100000);
-      *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value>>18) | 0b11110000);
-      *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return result(error_code::SUCCESS, utf8_output - start);
+template<endianness big_endian>
+inline size_t convert(const char16_t* buf, size_t len, char* utf8_output)
+{ // Transcodes len UTF-16 units to UTF-8; returns the number of bytes written, or 0 on an unpaired or misordered surrogate.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 4 ASCII characters
+        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t)); // load four 16-bit units at once
+            if (!match_system(big_endian)) {
+                v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits before applying the ASCII mask
+            }
+            if ((v & 0xFF80FF80FF80FF80) == 0) {
+                size_t final_pos = pos + 4;
+                while (pos < final_pos) {
+                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xF800) != 0xD800) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // must be a surrogate pair
+            if (pos + 1 >= len) { // a surrogate in the last position has no partner
+                return 0;
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) { // first unit is in 0xDC00..0xDFFF, i.e. not a high surrogate
+                return 0;
+            }
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) { // second unit is outside 0xDC00..0xDFFF, i.e. not a low surrogate
+                return 0;
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000;
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((value >> 18) | 0b11110000);
+            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            pos += 2;
+        }
+    }
+    return utf8_output - start; // also 0 when len == 0
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_output)
+{ // Same transcoding as convert(), but reports failures as result(error_code::SURROGATE, index of the offending unit) instead of 0.
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { utf8_output };
+    while (pos < len) {
+        // try to convert the next block of 4 ASCII characters
+        if (pos + 4 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t)); // load four 16-bit units at once
+            if (!match_system(big_endian))
+                v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits before applying the ASCII mask
+            if ((v & 0xFF80FF80FF80FF80) == 0) {
+                size_t final_pos = pos + 4;
+                while (pos < final_pos) {
+                    *utf8_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(buf[pos])) : char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF80) == 0) {
+            // will generate one UTF-8 byte
+            *utf8_output++ = char(word);
+            pos++;
+        } else if ((word & 0xF800) == 0) {
+            // will generate two UTF-8 bytes
+            // we have 0b110XXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 6) | 0b11000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else if ((word & 0xF800) != 0xD800) {
+            // will generate three UTF-8 bytes
+            // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((word >> 12) | 0b11100000);
+            *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            pos++;
+        } else {
+            // must be a surrogate pair
+            if (pos + 1 >= len) { // a surrogate in the last position has no partner
+                return result(error_code::SURROGATE, pos);
+            }
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) { // first unit is in 0xDC00..0xDFFF, i.e. not a high surrogate
+                return result(error_code::SURROGATE, pos);
+            }
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00);
+            if (diff2 > 0x3FF) { // second unit is outside 0xDC00..0xDFFF; the reported index is still that of the first unit
+                return result(error_code::SURROGATE, pos);
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000;
+            // will generate four UTF-8 bytes
+            // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+            *utf8_output++ = char((value >> 18) | 0b11110000);
+            *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+            *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+            *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            pos += 2;
+        }
+    }
+    return result(error_code::SUCCESS, utf8_output - start); // on success the second field is the number of bytes written
 }
 
 } // utf16_to_utf8 namespace
@@ -10893,7 +12405,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* utf8_ou
 #endif
 /* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf32/valid_utf16_to_utf32.h
 /* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
 #ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
 #define SIMDUTF_VALID_UTF16_TO_UTF32_H
@@ -10903,29 +12415,32 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf32 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800 ) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
+template<endianness big_endian> // code units are byte-swapped whenever match_system(big_endian) is false
+inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_output) // UTF-16 -> UTF-32 without range-checking surrogates; returns the number of char32_t written, or 0 if the input ends inside a surrogate pair
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0; // index of the next UTF-16 code unit to read
+    char32_t* start { utf32_output }; // kept so the number of written values can be computed at the end
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) != 0xD800) {
+            // Outside 0xD800..0xDFFF, so not a surrogate: widen the 16-bit word to a 32-bit word
+            *utf32_output++ = char32_t(word);
+            pos++;
+        } else {
+            // word is in 0xD800..0xDFFF; it is treated as the first half of a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800); // not range-checked in this "valid input" variant
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking: no second code unit is available
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00); // not range-checked either
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both halves into one code point
+            *utf32_output++ = char32_t(value);
+            pos += 2; // both code units of the pair were consumed
+        }
     }
-  }
-  return utf32_output - start;
+    return utf32_output - start; // number of char32_t values written
 }
 
 } // utf16_to_utf32 namespace
@@ -10935,7 +12450,7 @@ inline size_t convert_valid(const char16_t* buf, size_t len, char32_t* utf32_out
 
 #endif
 /* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_utf32/utf16_to_utf32.h
 /* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
 #ifndef SIMDUTF_UTF16_TO_UTF32_H
 #define SIMDUTF_UTF16_TO_UTF32_H
@@ -10945,58 +12460,72 @@ namespace scalar {
 namespace {
 namespace utf16_to_utf32 {
 
-template <endianness big_endian>
-inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800 ) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return 0; }
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return 0; }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
-    }
-  }
-  return utf32_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if((word &0xF800 ) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if(diff > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      if(pos + 1 >= len) { return result(error_code::SURROGATE, pos); } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if(diff2 > 0x3FF) { return result(error_code::SURROGATE, pos); }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
+template<endianness big_endian> // code units are byte-swapped whenever match_system(big_endian) is false
+inline size_t convert(const char16_t* buf, size_t len, char32_t* utf32_output) // validating UTF-16 -> UTF-32; returns the number of char32_t written, or 0 on a bad surrogate (0 is also the result for empty input)
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0; // index of the next UTF-16 code unit to read
+    char32_t* start { utf32_output }; // kept so the number of written values can be computed at the end
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) != 0xD800) {
+            // Outside 0xD800..0xDFFF, so not a surrogate: widen the 16-bit word to a 32-bit word
+            *utf32_output++ = char32_t(word);
+            pos++;
+        } else {
+            // word is in 0xD800..0xDFFF, so it must start a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) { // word is in 0xDC00..0xDFFF: rejected as the first half of a pair
+                return 0;
+            }
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking: no second code unit is available
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00); // wraps to a large value when next_word < 0xDC00
+            if (diff2 > 0x3FF) { // next_word is outside 0xDC00..0xDFFF
+                return 0;
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both halves into one code point
+            *utf32_output++ = char32_t(value);
+            pos += 2; // both code units of the pair were consumed
+        }
+    }
+    return utf32_output - start; // number of char32_t values written
+}
+
+template<endianness big_endian> // code units are byte-swapped whenever match_system(big_endian) is false
+inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) // validating UTF-16 -> UTF-32; on failure the result carries error_code::SURROGATE and the index of the offending code unit
+{
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0; // index of the next UTF-16 code unit to read
+    char32_t* start { utf32_output }; // kept so the number of written values can be computed at the end
+    while (pos < len) {
+        uint16_t word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xF800) != 0xD800) {
+            // Outside 0xD800..0xDFFF, so not a surrogate: widen the 16-bit word to a 32-bit word
+            *utf32_output++ = char32_t(word);
+            pos++;
+        } else {
+            // word is in 0xD800..0xDFFF, so it must start a surrogate pair
+            uint16_t diff = uint16_t(word - 0xD800);
+            if (diff > 0x3FF) { // word is in 0xDC00..0xDFFF: rejected as the first half of a pair
+                return result(error_code::SURROGATE, pos);
+            }
+            if (pos + 1 >= len) {
+                return result(error_code::SURROGATE, pos);
+            } // minimal bound checking: no second code unit is available
+            uint16_t next_word = !match_system(big_endian) ? utf16::swap_bytes(data[pos + 1]) : data[pos + 1];
+            uint16_t diff2 = uint16_t(next_word - 0xDC00); // wraps to a large value when next_word < 0xDC00
+            if (diff2 > 0x3FF) { // next_word is outside 0xDC00..0xDFFF
+                return result(error_code::SURROGATE, pos);
+            }
+            uint32_t value = (diff << 10) + diff2 + 0x10000; // combine both halves into one code point
+            *utf32_output++ = char32_t(value);
+            pos += 2; // both code units of the pair were consumed
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf32_output - start);
+    return result(error_code::SUCCESS, utf32_output - start); // on success the second field is the number of char32_t written
 }
 
 } // utf16_to_utf32 namespace
@@ -11007,7 +12536,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char32_t* utf
 #endif
 /* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
 #ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
 #define SIMDUTF_VALID_UTF8_TO_UTF16_H
@@ -11017,74 +12546,80 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII bytes
-    if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 8;
-        while(pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 1 >= len) { break; } // minimal bound checking
-      uint16_t code_point = uint16_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
-      if (!match_system(big_endian)) {
-        code_point = utf16::swap_bytes(uint16_t(code_point));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 2 >= len) { break; } // minimal bound checking
-      uint16_t code_point = uint16_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
-      if (!match_system(big_endian)) {
-        code_point = utf16::swap_bytes(uint16_t(code_point));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { break; } // minimal bound checking
-      uint32_t code_point = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
-                           | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
+template<endianness big_endian> // output code units are byte-swapped whenever match_system(big_endian) is false
+inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output) // UTF-8 -> UTF-16 without validating continuation bytes or ranges; returns the number of char16_t written
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0; // index of the next input byte to read
+    char16_t* start { utf16_output }; // kept so the number of written code units can be computed at the end
+    while (pos < len) {
+        // try to convert the next block of 8 ASCII bytes
+        if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0x8080808080808080) == 0) { // no byte has its top bit set: all 8 bytes are ASCII
+                size_t final_pos = pos + 8;
+                while (pos < final_pos) {
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 1 >= len) {
+                break;
+            } // minimal bound checking: truncated sequence, stop and return what was written so far
+            uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
+            if (!match_system(big_endian)) {
+                code_point = utf16::swap_bytes(uint16_t(code_point));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 2 >= len) {
+                break;
+            } // minimal bound checking: truncated sequence, stop and return what was written so far
+            uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
+            if (!match_system(big_endian)) {
+                code_point = utf16::swap_bytes(uint16_t(code_point));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // lead byte has the form 0b11110xxx
+            // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+            if (pos + 3 >= len) {
+                break;
+            } // minimal bound checking: truncated sequence, stop and return what was written so far
+            uint32_t code_point = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
+                | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
+            code_point -= 0x10000; // offset into the supplementary range before splitting into two halves
+            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); // upper bits of the offset
+            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); // lower 10 bits of the offset
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos += 4;
+        } else {
+            // we may have a continuation but we do not do error checking
+            return 0;
+        }
     }
-  }
-  return utf16_output - start;
+    return utf16_output - start; // number of char16_t code units written (also reached via the break paths above)
 }
 
-
 } // namespace utf8_to_utf16
 } // unnamed namespace
 } // namespace scalar
@@ -11092,7 +12627,7 @@ inline size_t convert_valid(const char* buf, size_t len, char16_t* utf16_output)
 
 #endif
 /* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
 #ifndef SIMDUTF_UTF8_TO_UTF16_H
 #define SIMDUTF_UTF8_TO_UTF16_H
@@ -11102,184 +12637,230 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf16 {
 
-template <endianness big_endian>
-inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 2 >= len) { return 0; } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return 0;
-      }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      return 0;
-    }
-  }
-  return utf16_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)): char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point)) { return result(error_code::OVERLONG, pos);}
-      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
-      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
-      else { return result(error_code::HEADER_BITS, pos); }
+template<endianness big_endian> // output code units are byte-swapped whenever match_system(big_endian) is false
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output) // validating UTF-8 -> UTF-16; returns the number of char16_t written, or 0 on any invalid sequence (0 is also the result for empty input)
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0; // index of the next input byte to read
+    char16_t* start { utf16_output }; // kept so the number of written code units can be computed at the end
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // OR both halves so a single mask test covers all 16 bytes
+            if ((v & 0x8080808080808080) == 0) { // no byte has its top bit set: all 16 bytes are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking: the sequence is truncated
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a 0b10xxxxxx continuation byte
+                return 0;
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) { // values below 0x80 fit in one byte, so this encoding is rejected
+                return 0;
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 2 >= len) {
+                return 0;
+            } // minimal bound checking: the sequence is truncated
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte is not a continuation byte
+                return 0;
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if (code_point < 0x800 || 0xffff < code_point || (0xd7ff < code_point && code_point < 0xe000)) { // too small for three bytes, or inside the surrogate range 0xD800..0xDFFF
+                return 0;
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // lead byte has the form 0b11110xxx
+            // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+            if (pos + 3 >= len) {
+                return 0;
+            } // minimal bound checking: the sequence is truncated
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte is not a continuation byte
+                return 0;
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) { // fourth byte is not a continuation byte
+                return 0;
+            }
+
+            // range check
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff || 0x10ffff < code_point) { // too small for four bytes, or above the accepted maximum 0x10FFFF
+                return 0;
+            }
+            code_point -= 0x10000; // offset into the supplementary range before splitting into two halves
+            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); // upper 10 bits of the offset
+            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); // lower 10 bits of the offset
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos += 4;
+        } else {
+            return 0; // leading_byte matches none of the accepted lead-byte patterns
+        }
+    }
+    return utf16_output - start; // number of char16_t code units written
+}
+
+template<endianness big_endian> // output code units are byte-swapped whenever match_system(big_endian) is false
+inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output) // validating UTF-8 -> UTF-16; on failure the result carries an error_code and the byte index where the bad sequence starts
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0; // index of the next input byte to read
+    char16_t* start { utf16_output }; // kept so the number of written code units can be computed at the end
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // OR both halves so a single mask test covers all 16 bytes
+            if ((v & 0x8080808080808080) == 0) { // no byte has its top bit set: all 16 bytes are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(buf[pos])) : char16_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // converting one ASCII byte
+            *utf16_output++ = !match_system(big_endian) ? char16_t(utf16::swap_bytes(leading_byte)) : char16_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 1 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking: the sequence is truncated
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a 0b10xxxxxx continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) { // values below 0x80 fit in one byte
+                return result(error_code::OVERLONG, pos);
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8, it should become
+            // a single UTF-16 word.
+            if (pos + 2 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking: the sequence is truncated
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if ((code_point < 0x800) || (0xffff < code_point)) { // values below 0x800 fit in fewer bytes
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xd7ff < code_point && code_point < 0xe000) { // 0xD800..0xDFFF is the surrogate range
+                return result(error_code::SURROGATE, pos);
+            }
+            if (!match_system(big_endian)) {
+                code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+            }
+            *utf16_output++ = char16_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // lead byte has the form 0b11110xxx
+            // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+            if (pos + 3 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking: the sequence is truncated
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) { // fourth byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+
+            // range check
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff) { // value would have fit in at most three bytes
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0x10ffff < code_point) { // above the accepted maximum 0x10FFFF
+                return result(error_code::TOO_LARGE, pos);
+            }
+            code_point -= 0x10000; // offset into the supplementary range before splitting into two halves
+            uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); // upper 10 bits of the offset
+            uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); // lower 10 bits of the offset
+            if (!match_system(big_endian)) {
+                high_surrogate = utf16::swap_bytes(high_surrogate);
+                low_surrogate = utf16::swap_bytes(low_surrogate);
+            }
+            *utf16_output++ = char16_t(high_surrogate);
+            *utf16_output++ = char16_t(low_surrogate);
+            pos += 4;
+        } else {
+            // we either have too many continuation bytes or an invalid leading byte
+            if ((leading_byte & 0b11000000) == 0b10000000) { // a 0b10xxxxxx continuation byte where a lead byte was expected
+                return result(error_code::TOO_LONG, pos);
+            } else {
+                return result(error_code::HEADER_BITS, pos);
+            }
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf16_output - start);
+    return result(error_code::SUCCESS, utf16_output - start); // on success the second field is the number of char16_t written
 }
 
 /**
@@ -11295,41 +12876,44 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o
  * If the error is believed to have occured prior to 'buf', the count value contain in the result
  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
  */
-template <endianness endian>
-inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  size_t how_far_back = 3; // 3 bytes in the past + current position
-  if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-i];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if(found_leading_bytes) {
-      buf -= i;
-      extra_len = i;
-      break;
-    }
-  }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
-  // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if(!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is continuation]
-    // Or we possibly have a stream that does not start with a leading byte.
-    return result(error_code::TOO_LONG, -how_far_back);
-  }
-  result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
-  if (res.error) {
-    res.count -= extra_len;
-  }
-  return res;
+template<endianness endian>
+inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char16_t* utf16_output)
+{ // Moves buf back to the nearest preceding leading byte (if any) and converts from there with convert_with_errors<endian>.
+    size_t extra_len { 0 }; // number of bytes buf is moved backwards by the search below
+    // buf may point at a continuation byte, so we potentially need to go back and find its leading byte.
+    // In theory '3' would be sufficient, but sometimes the error can go back quite far, so look at all prior_bytes.
+    size_t how_far_back = prior_bytes;
+    // size_t how_far_back = 3; // 3 bytes in the past + current position
+    // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+    bool found_leading_bytes { false };
+    // The bound is inclusive (i <= how_far_back, not 'i < how_far_back'): i == 0 tests the byte at buf itself.
+    for (size_t i = 0; i <= how_far_back; i++) { // NOTE(review): assumes prior_bytes bytes before buf are readable - confirm against callers
+        unsigned char byte = buf[0 - i]; // the byte i positions before buf
+        found_leading_bytes = ((byte & 0b11000000) != 0b10000000); // true unless byte has the form 10xxxxxx (continuation byte)
+        if (found_leading_bytes) {
+            buf -= i;
+            extra_len = i;
+            break;
+        }
+    }
+    //
+    // It is possible for this function to return a "negative" count in its result.
+    // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described in the C Standard as <stddef.h>.
+    // C Standard Section 4.1.5 defines size_t as the unsigned integral type of the result of the sizeof operator.
+    //
+    // An unsigned type will simply wrap around arithmetically (well defined); '0 - how_far_back' below relies on that.
+    //
+    if (!found_leading_bytes) {
+        // Neither buf nor any of the how_far_back bytes before it is a leading byte, e.g.
+        // [....] [continuation] [continuation] [continuation] | [buf is continuation]
+        // Or we possibly have a stream that does not start with a leading byte.
+        return result(error_code::TOO_LONG, 0 - how_far_back);
+    }
+    result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output); // convert starting at the leading byte found above
+    if (res.error) {
+        res.count -= extra_len; // make the reported position relative to the original buf again
+    }
+    return res;
 }
 
 } // utf8_to_utf16 namespace
@@ -11340,7 +12924,7 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
 #endif
 /* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
 #ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
 #define SIMDUTF_VALID_UTF8_TO_UTF32_H
@@ -11350,55 +12934,61 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf32 {
 
-inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII bytes
-    if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 8;
-        while(pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if(pos + 1 >= len) { break; } // minimal bound checking
-      *utf32_output++ = char32_t(((leading_byte &0b00011111) << 6) | (data[pos + 1] &0b00111111));
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if(pos + 2 >= len) { break; } // minimal bound checking
-      *utf32_output++ = char32_t(((leading_byte &0b00001111) << 12) | ((data[pos + 1] &0b00111111) << 6) | (data[pos + 2] &0b00111111));
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { break; } // minimal bound checking
-      uint32_t code_word = ((leading_byte & 0b00000111) << 18 )| ((data[pos + 1] &0b00111111) << 12)
-                           | ((data[pos + 2] &0b00111111) << 6) | (data[pos + 3] &0b00111111);
-      *utf32_output++ = char32_t(code_word);
-      pos += 4;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
+inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output)
+{ // Decodes len bytes of UTF-8 from buf into utf32_output without validating them; returns the number of char32_t written (0 on a byte that cannot start a sequence).
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf); // unsigned view of buf for the masks and shifts below
+    size_t pos = 0; // index of the next input byte to decode
+    char32_t* start { utf32_output }; // remembered so the amount of output can be computed at the end
+    while (pos < len) {
+        // fast path: try to convert the next block of 8 ASCII bytes in one go
+        if (pos + 8 <= len) { // if it is safe to read 8 more bytes, check that they are ascii
+            uint64_t v;
+            ::memcpy(&v, data + pos, sizeof(uint64_t));
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set, so all 8 are ASCII
+                size_t final_pos = pos + 8;
+                while (pos < final_pos) {
+                    *utf32_output++ = char32_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // a single ASCII byte is its own code point
+            *utf32_output++ = char32_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8 sequence (110xxxxx 10xxxxxx): 5 + 6 payload bits
+            if (pos + 1 >= len) {
+                break;
+            } // minimal bound checking: a sequence cut off by the end of the input is dropped
+            *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) | (data[pos + 1] & 0b00111111));
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx): 4 + 6 + 6 payload bits
+            if (pos + 2 >= len) {
+                break;
+            } // minimal bound checking: a sequence cut off by the end of the input is dropped
+            *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) | ((data[pos + 1] & 0b00111111) << 6) | (data[pos + 2] & 0b00111111));
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // We have a four-byte UTF-8 sequence (11110xxx followed by three 10xxxxxx): 3 + 6 + 6 + 6 payload bits
+            if (pos + 3 >= len) {
+                break;
+            } // minimal bound checking: a sequence cut off by the end of the input is dropped
+            uint32_t code_word = ((leading_byte & 0b00000111) << 18) | ((data[pos + 1] & 0b00111111) << 12)
+                | ((data[pos + 2] & 0b00111111) << 6) | (data[pos + 3] & 0b00111111);
+            *utf32_output++ = char32_t(code_word);
+            pos += 4;
+        } else {
+            // we may have a continuation byte here, but we do not do error checking: give up and report 0
+            return 0;
+        }
     }
-  }
-  return utf32_output - start;
+    return utf32_output - start; // number of char32_t values written
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace scalar
@@ -11406,7 +12996,7 @@ inline size_t convert_valid(const char* buf, size_t len, char32_t* utf32_output)
 
 #endif
 /* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
 #ifndef SIMDUTF_UTF8_TO_UTF32_H
 #define SIMDUTF_UTF8_TO_UTF32_H
@@ -11416,149 +13006,195 @@ namespace scalar {
 namespace {
 namespace utf8_to_utf32 {
 
-inline size_t convert(const char* buf, size_t len, char32_t* utf32_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if(pos + 1 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return 0; }
-      *utf32_output++ = char32_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if(pos + 2 >= len) { return 0; } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return 0;
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return 0; } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return 0; }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return 0; }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) { return 0; }
-      *utf32_output++ = char32_t(code_point);
-      pos += 4;
-    } else {
-      return 0;
-    }
-  }
-  return utf32_output - start;
-}
-
-inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output) {
- const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while(pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if(pos + 1 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) { return result(error_code::OVERLONG, pos); }
-      *utf32_output++ = char32_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if(pos + 2 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point) { return result(error_code::OVERLONG, pos); }
-      if (0xd7ff < code_point && code_point < 0xe000) { return result(error_code::SURROGATE, pos); }
-      *utf32_output++ = char32_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if(pos + 3 >= len) { return result(error_code::TOO_SHORT, pos); } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos);}
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) { return result(error_code::TOO_SHORT, pos); }
-
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) { return result(error_code::OVERLONG, pos); }
-      if (0x10ffff < code_point) { return result(error_code::TOO_LARGE, pos); }
-      *utf32_output++ = char32_t(code_point);
-      pos += 4;
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) { return result(error_code::TOO_LONG, pos); }
-      else { return result(error_code::HEADER_BITS, pos); }
+inline size_t convert(const char* buf, size_t len, char32_t* utf32_output)
+{ // Validating UTF-8 to UTF-32 conversion: returns the number of char32_t written, or 0 as soon as invalid input is met.
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf); // unsigned view of buf for the masks and shifts below
+    size_t pos = 0; // index of the next input byte to decode
+    char32_t* start { utf32_output }; // remembered so the amount of output can be computed at the end
+    while (pos < len) {
+        // fast path: try to convert the next block of 16 ASCII bytes in one go
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // OR the two words so that a single mask tests all 16 bytes
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set, so all 16 are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf32_output++ = char32_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // a single ASCII byte is its own code point
+            *utf32_output++ = char32_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8 sequence (110xxxxx 10xxxxxx)
+            if (pos + 1 >= len) {
+                return 0;
+            } // minimal bound checking: the sequence is cut off by the end of the input
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte must be a continuation byte
+                return 0;
+            }
+            // range check: values below 0x80 are overlong; 11 payload bits can never exceed 0x7ff
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) {
+                return 0;
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (pos + 2 >= len) {
+                return 0;
+            } // minimal bound checking: the sequence is cut off by the end of the input
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte must be a continuation byte
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte must be a continuation byte
+                return 0;
+            }
+            // range check: values below 0x800 are overlong, 0xd800..0xdfff are surrogates; 16 payload bits can never exceed 0xffff
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if (code_point < 0x800 || 0xffff < code_point || (0xd7ff < code_point && code_point < 0xe000)) {
+                return 0;
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // We have a four-byte UTF-8 sequence (11110xxx followed by three 10xxxxxx)
+            if (pos + 3 >= len) {
+                return 0;
+            } // minimal bound checking: the sequence is cut off by the end of the input
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte must be a continuation byte
+                return 0;
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte must be a continuation byte
+                return 0;
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) { // fourth byte must be a continuation byte
+                return 0;
+            }
+
+            // range check: values up to 0xffff are overlong, values above 0x10ffff are rejected as too large
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff || 0x10ffff < code_point) {
+                return 0;
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 4;
+        } else { // a continuation byte where a leading byte is expected, or a byte of the form 11111xxx
+            return 0;
+        }
+    }
+    return utf32_output - start; // number of char32_t values written
+}
+
+inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_output)
+{ // Validating UTF-8 to UTF-32 conversion: result(SUCCESS, number of char32_t written), or result(error code, index of the first byte of the offending sequence).
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf); // unsigned view of buf for the masks and shifts below
+    size_t pos = 0; // index of the next input byte to decode; reported as-is when an error is found
+    char32_t* start { utf32_output }; // remembered so the amount of output can be computed at the end
+    while (pos < len) {
+        // fast path: try to convert the next block of 16 ASCII bytes in one go
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // OR the two words so that a single mask tests all 16 bytes
+            if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set, so all 16 are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *utf32_output++ = char32_t(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // a single ASCII byte is its own code point
+            *utf32_output++ = char32_t(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) {
+            // We have a two-byte UTF-8 sequence (110xxxxx 10xxxxxx)
+            if (pos + 1 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking: the sequence is cut off by the end of the input
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check: values below 0x80 are overlong; 11 payload bits can never exceed 0x7ff
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            if (code_point < 0x80 || 0x7ff < code_point) {
+                return result(error_code::OVERLONG, pos);
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // We have a three-byte UTF-8 sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (pos + 2 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking: the sequence is cut off by the end of the input
+
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            // range check: values below 0x800 are overlong; 16 payload bits can never exceed 0xffff
+            uint32_t code_point = (leading_byte & 0b00001111) << 12 | (data[pos + 1] & 0b00111111) << 6 | (data[pos + 2] & 0b00111111);
+            if (code_point < 0x800 || 0xffff < code_point) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xd7ff < code_point && code_point < 0xe000) { // 0xd800..0xdfff is the surrogate range
+                return result(error_code::SURROGATE, pos);
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 3;
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+            // We have a four-byte UTF-8 sequence (11110xxx followed by three 10xxxxxx)
+            if (pos + 3 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // minimal bound checking: the sequence is cut off by the end of the input
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) { // second byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 2] & 0b11000000) != 0b10000000) { // third byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+            if ((data[pos + 3] & 0b11000000) != 0b10000000) { // fourth byte is not a continuation byte
+                return result(error_code::TOO_SHORT, pos);
+            }
+
+            // range check: values up to 0xffff are overlong, values above 0x10ffff are too large
+            uint32_t code_point = (leading_byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+            if (code_point <= 0xffff) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0x10ffff < code_point) {
+                return result(error_code::TOO_LARGE, pos);
+            }
+            *utf32_output++ = char32_t(code_point);
+            pos += 4;
+        } else {
+            // we either have a continuation byte where a leading byte is expected (TOO_LONG) or an invalid leading byte (HEADER_BITS)
+            if ((leading_byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            } else {
+                return result(error_code::HEADER_BITS, pos);
+            }
+        }
     }
-  }
-  return result(error_code::SUCCESS, utf32_output - start);
+    return result(error_code::SUCCESS, utf32_output - start); // on success the count is the number of char32_t written
 }
 
 /**
@@ -11574,41 +13210,44 @@ inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_o
  * If the error is believed to have occured prior to 'buf', the count value contain in the result
  * will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
  */
-inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  size_t how_far_back = 3; // 3 bytes in the past + current position
-  if(how_far_back > prior_bytes) { how_far_back = prior_bytes; }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-i];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if(found_leading_bytes) {
-      buf -= i;
-      extra_len = i;
-      break;
-    }
-  }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described in C Standard as <stddef.h>.
-  // C Standard Section 4.1.5 defines size_t as an unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if(!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is continuation]
-    // Or we possibly have a stream that does not start with a leading byte.
-    return result(error_code::TOO_LONG, -how_far_back);
-  }
-
-  result res = convert_with_errors(buf, len + extra_len, utf32_output);
-  if (res.error) {
-    res.count -= extra_len;
-  }
-  return res;
+inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output)
+{ // Moves buf back (by at most 3 bytes) to the nearest preceding leading byte and converts from there with convert_with_errors.
+    size_t extra_len { 0 }; // number of bytes buf is moved backwards by the search below
+    // buf may point at a continuation byte, so we potentially need to go back and find its leading byte.
+    size_t how_far_back = 3; // a UTF-8 sequence has at most 4 bytes: 3 bytes in the past + current position
+    if (how_far_back > prior_bytes) { // never look back further than prior_bytes
+        how_far_back = prior_bytes;
+    }
+    bool found_leading_bytes { false };
+    // The bound is inclusive (i <= how_far_back, not 'i < how_far_back'): i == 0 tests the byte at buf itself.
+    for (size_t i = 0; i <= how_far_back; i++) { // NOTE(review): assumes prior_bytes bytes before buf are readable - confirm against callers
+        unsigned char byte = buf[0 - i]; // the byte i positions before buf
+        found_leading_bytes = ((byte & 0b11000000) != 0b10000000); // true unless byte has the form 10xxxxxx (continuation byte)
+        if (found_leading_bytes) {
+            buf -= i;
+            extra_len = i;
+            break;
+        }
+    }
+    //
+    // It is possible for this function to return a "negative" count in its result.
+    // C++ Standard Section 18.1 defines size_t in <cstddef>, which is described in the C Standard as <stddef.h>.
+    // C Standard Section 4.1.5 defines size_t as the unsigned integral type of the result of the sizeof operator.
+    //
+    // An unsigned type will simply wrap around arithmetically (well defined); '0 - how_far_back' below relies on that.
+    //
+    if (!found_leading_bytes) {
+        // If how_far_back == 3, we may have four consecutive continuation bytes:
+        // [....] [continuation] [continuation] [continuation] | [buf is continuation]
+        // Or we possibly have a stream that does not start with a leading byte.
+        return result(error_code::TOO_LONG, 0 - how_far_back);
+    }
+
+    result res = convert_with_errors(buf, len + extra_len, utf32_output); // convert starting at the leading byte found above
+    if (res.error) {
+        res.count -= extra_len; // make the reported position relative to the original buf again
+    }
+    return res;
 }
 
 } // utf8_to_utf32 namespace
@@ -11618,17 +13257,596 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
 
 #endif
 /* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
-//
 
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1_to_utf8/latin1_to_utf8.h
+/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF8_H
+#define SIMDUTF_LATIN1_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf8 {
+
+inline size_t convert(const char* buf, size_t len, char* utf8_output)
+{
+    // Transcodes `len` Latin-1 bytes from `buf` into UTF-8 at `utf8_output`
+    // and returns the number of UTF-8 bytes written: one for each ASCII byte,
+    // two for each byte that has its top bit set.
+    const unsigned char* const bytes = reinterpret_cast<const unsigned char*>(buf);
+    constexpr size_t block_size = 2 * sizeof(uint64_t);
+    constexpr uint64_t top_bits = 0x8080808080808080;
+    char* cursor = utf8_output;
+    size_t index = 0;
+    while (index < len) {
+        // Fast path: when 16 input bytes remain, test all of them at once.
+        if (index + block_size <= len) {
+            uint64_t lower_half;
+            uint64_t upper_half;
+            ::memcpy(&lower_half, bytes + index, sizeof(uint64_t));
+            ::memcpy(&upper_half, bytes + index + sizeof(uint64_t), sizeof(uint64_t));
+            // OR-ing keeps any top bit that is set; none set means 16 ASCII bytes.
+            if (((lower_half | upper_half) & top_bits) == 0) {
+                for (const size_t block_end = index + block_size; index < block_end; index++) {
+                    *cursor++ = char(buf[index]);
+                }
+                continue;
+            }
+        }
+
+        const unsigned char current = bytes[index++];
+        if ((current & 0x80) != 0) {
+            // Non-ASCII: lead byte 110000xx followed by trail byte 10xxxxxx.
+            *cursor++ = char((current >> 6) | 0b11000000);
+            *cursor++ = char((current & 0b111111) | 0b10000000);
+        } else {
+            *cursor++ = char(current); // ASCII maps to itself
+        }
+    }
+    return cursor - utf8_output;
+}
+
+} // latin1_to_utf8 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1_to_utf16/latin1_to_utf16.h
+/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF16_H
+#define SIMDUTF_LATIN1_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf16 {
+
+template<endianness big_endian>
+inline size_t convert(const char* buf, size_t len, char16_t* utf16_output)
+{
+    // Widens each of the `len` Latin-1 bytes in `buf` to one UTF-16 code unit,
+    // byte-swapping it when match_system(big_endian) is false, and returns
+    // the number of code units written to `utf16_output`.
+    const uint8_t* const bytes = reinterpret_cast<const uint8_t*>(buf);
+    char16_t* cursor = utf16_output;
+    for (size_t index = 0; index < len; index++) {
+        const uint16_t unit = uint16_t(bytes[index]); // zero-extend: the Latin-1 value is the code point
+        *cursor++ = char16_t(match_system(big_endian) ? unit : utf16::swap_bytes(unit));
+    }
+
+    return cursor - utf16_output;
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_output)
+{
+    // Widens `len` Latin-1 bytes from `buf` to UTF-16 code units and reports
+    // through a `result`. No input byte is ever rejected here, so the code is
+    // always error_code::SUCCESS and the count is the number of units written.
+    const uint8_t* cursor = reinterpret_cast<const uint8_t*>(buf);
+    const uint8_t* const last = cursor + len;
+    char16_t* const first_out = utf16_output;
+    for (; cursor != last; ++cursor) {
+        const uint16_t unit = uint16_t(*cursor); // zero-extend the Latin-1 byte
+        *utf16_output++ = char16_t(match_system(big_endian) ? unit : utf16::swap_bytes(unit));
+    }
+    return result(error_code::SUCCESS, utf16_output - first_out);
+}
+
+} // latin1_to_utf16 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/latin1_to_utf32/latin1_to_utf32.h
+/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF32_H
+#define SIMDUTF_LATIN1_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf32 {
+
+inline size_t convert(const char* buf, size_t len, char32_t* utf32_output)
+{
+    // Zero-extends each of the `len` Latin-1 bytes in `buf` to one UTF-32
+    // code point; returns how many were stored in `utf32_output`.
+    size_t written = 0;
+    for (; written < len; written++)
+        utf32_output[written] = static_cast<char32_t>(static_cast<unsigned char>(buf[written]));
+    return written;
+}
+
+inline result convert_with_errors(const char32_t* buf, size_t len, char32_t* utf32_output)
+{
+    // Copies `len` 32-bit values from `buf` and reports SUCCESS plus that count.
+    // NOTE(review): the input is char32_t, not Latin-1 bytes as in convert() -- confirm intent.
+    const uint32_t* const words = reinterpret_cast<const uint32_t*>(buf);
+    for (size_t index = 0; index < len; index++)
+        utf32_output[index] = char32_t(words[index]);
+    return result(error_code::SUCCESS, len);
+}
+
+} // latin1_to_utf32 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_latin1/utf8_to_latin1.h
+/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
+#ifndef SIMDUTF_UTF8_TO_LATIN1_H
+#define SIMDUTF_UTF8_TO_LATIN1_H
+#include <iostream>
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert(const char* buf, size_t len, char* latin_output)
+{
+    // Transcodes `len` bytes of UTF-8 from `buf` into Latin-1 at `latin_output`.
+    // Returns the number of Latin-1 bytes written, or 0 when the input is not
+    // well-formed UTF-8 limited to U+0000..U+00FF (truncated or overlong two-byte
+    // sequence, stray continuation byte, lead byte of a longer sequence).
+    // Bytes stored in `latin_output` before a failure is detected stay there.
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0;
+    char* start { latin_output };
+    while (pos < len) {
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1, v2;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // only the top bit of each byte matters, so the halves can be OR-ed
+            if ((v & 0x8080808080808080) == 0) { // no top bit set: all 16 bytes are ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        uint8_t leading_byte = data[pos];
+        if (leading_byte < 0b10000000) {
+            *latin_output++ = char(leading_byte); // a single ASCII byte maps to itself
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) { // 110xxxxx: two-byte sequence
+            if (pos + 1 >= len) {
+                return 0; // the sequence is cut off by the end of the input
+            }
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0; // the second byte must be a continuation byte (10xxxxxx)
+            }
+            // Drop the 110/10 markers and join the remaining 5 + 6 payload bits.
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+            // Below 0x80 is an overlong encoding of an ASCII character (not valid
+            // UTF-8); above 0xFF does not fit in Latin-1. Reject both.
+            if (code_point < 0x80 || 0xFF < code_point) {
+                return 0;
+            }
+            *latin_output++ = char(code_point);
+            pos += 2;
+        } else {
+            return 0; // this byte cannot start a one- or two-byte sequence
+        }
+    }
+    return latin_output - start;
+}
+
+inline result convert_with_errors(const char* buf, size_t len, char* latin_output)
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    size_t pos = 0; // index of the next input byte; this is also the position reported with every error
+    char* start { latin_output };
+    // Transcodes UTF-8 to Latin-1: returns SUCCESS with the number of Latin-1 bytes written, or an error code with the input index where the bad sequence starts.
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // only the top bit of each byte matters, so the two halves can be OR-ed together
+            if ((v & 0x8080808080808080) == 0) { // no top bit set in any of the 16 bytes: everything is ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        // the block was not all ASCII (or fewer than 16 bytes remain): handle one sequence
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // a single ASCII byte is copied through unchanged
+            *latin_output++ = char(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits are 110:
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                return result(error_code::TOO_SHORT, pos);
+            } // the second byte would lie past the end of the input
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return result(error_code::TOO_SHORT, pos);
+            } // the next byte is not a continuation byte (those start with the bits 10)
+            // assemble the code point, then range check it
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // drop the 110 and 10 marker bits and join the remaining 5 + 6 payload bits
+            if (code_point < 0x80) {
+                return result(error_code::OVERLONG, pos);
+            }
+            if (0xFF < code_point) {
+                return result(error_code::TOO_LARGE, pos);
+            } // only 0x80-0xFF (the non-ASCII Latin-1 range) is accepted from a two-byte sequence
+            *latin_output++ = char(code_point);
+            pos += 2;
+        } else if ((leading_byte & 0b11110000) == 0b11100000) {
+            // Lead byte of a three-byte sequence: reported as TOO_LARGE without reading further
+            return result(error_code::TOO_LARGE, pos);
+        } else if ((leading_byte & 0b11111000) == 0b11110000) { // 11110xxx
+            // Lead byte of a four-byte sequence: reported as TOO_LARGE without reading further
+            return result(error_code::TOO_LARGE, pos);
+        } else {
+            // we either have too many continuation bytes or an invalid leading byte
+            if ((leading_byte & 0b11000000) == 0b10000000) {
+                return result(error_code::TOO_LONG, pos);
+            }
+
+            return result(error_code::HEADER_BITS, pos);
+        }
+    }
+    return result(error_code::SUCCESS, latin_output - start);
+}
+
+} // utf8_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_latin1/utf16_to_latin1.h
+/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
+#ifndef SIMDUTF_UTF16_TO_LATIN1_H
+#define SIMDUTF_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+#include <cstring> // for std::memcpy
+
+template<endianness big_endian>
+inline size_t convert(const char16_t* buf, size_t len, char* latin_output)
+{
+    // Narrows `len` UTF-16 code units from `buf` to Latin-1 bytes. Returns the
+    // number of bytes produced, or 0 when any code unit is above 0xFF; in that
+    // case `latin_output` is not written at all, because the bytes are first
+    // collected in a temporary buffer.
+    const uint16_t* const units = reinterpret_cast<const uint16_t*>(buf);
+    std::vector<char> staged(len);
+    uint16_t seen_bits = 0; // OR of every code unit; a set high byte means "not Latin-1"
+
+    for (size_t index = 0; index < len; index++) {
+        // Byte-swap the code unit first when match_system(big_endian) is false.
+        const uint16_t unit = match_system(big_endian) ? units[index] : utf16::swap_bytes(units[index]);
+        seen_bits |= unit;
+        staged[index] = char(unit & 0xFF);
+    }
+    if ((seen_bits & 0xFF00) != 0) {
+        return 0;
+    }
+
+    // Publish the staged bytes only now that the whole input is known to fit.
+    std::memcpy(latin_output, staged.data(), len);
+    return len;
+}
+
+template<endianness big_endian>
+inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_output)
+{
+    // Narrows `len` UTF-16 code units from `buf` to Latin-1 bytes in
+    // `latin_output`. Returns SUCCESS with the number of bytes written, or
+    // TOO_LARGE with the index of the first code unit above 0xFF (the bytes
+    // for the code units before it have already been written).
+    const uint16_t* data = reinterpret_cast<const uint16_t*>(buf);
+    size_t pos = 0;
+    char* start { latin_output };
+    uint16_t word;
+    while (pos < len) {
+        if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that they are Latin1
+            uint64_t v1, v2, v3, v4;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
+            ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
+            ::memcpy(&v4, data + pos + 12, sizeof(uint64_t));
+
+            if (!match_system(big_endian)) {
+                // Rotate each group by one byte so that the byte which is the high
+                // half of every swapped code unit lands under the 0xFF00 mask below.
+                // Each group must be rotated from its own value (v4 used to take v1).
+                v1 = (v1 >> 8) | (v1 << (64 - 8));
+                v2 = (v2 >> 8) | (v2 << (64 - 8));
+                v3 = (v3 >> 8) | (v3 << (64 - 8));
+                v4 = (v4 >> 8) | (v4 << (64 - 8));
+            }
+
+            if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = !match_system(big_endian) ? char(utf16::swap_bytes(data[pos])) : char(data[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+        if ((word & 0xFF00) == 0) {
+            *latin_output++ = char(word & 0xFF);
+            pos++;
+        } else {
+            return result(error_code::TOO_LARGE, pos);
+        }
+    }
+    return result(error_code::SUCCESS, latin_output - start);
+}
+
+} // utf16_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_latin1/utf32_to_latin1.h
+/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
+#ifndef SIMDUTF_UTF32_TO_LATIN1_H
+#define SIMDUTF_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+inline size_t convert(const char32_t* buf, size_t len, char* latin1_output)
+{
+    // Narrows `len` UTF-32 code points from `buf` to Latin-1 bytes. Returns
+    // the number of bytes written, or 0 if any code point is above 0xFF.
+    // The low byte of every code point is stored in `latin1_output` either
+    // way; validity is only decided once the whole input has been scanned.
+    const uint32_t* const words = reinterpret_cast<const uint32_t*>(buf);
+    uint32_t seen_bits = 0; // OR of all inputs: any bit above the low byte means failure
+
+    for (size_t index = 0; index < len; index++) {
+        const uint32_t code_point = words[index];
+        seen_bits |= code_point;
+        // Store only the low byte; the upper bits are judged through seen_bits.
+        latin1_output[index] = static_cast<char>(code_point & 0xFF);
+    }
+
+    const bool fits_in_latin1 = (seen_bits & 0xFFFFFF00) == 0;
+    return fits_in_latin1 ? len : 0;
+}
+
+inline result convert_with_errors(const char32_t* buf, size_t len, char* latin1_output)
+{
+    // Narrows `len` UTF-32 code points to Latin-1. Returns SUCCESS with the bytes written,
+    // or TOO_LARGE with the index of the first code point above 0xFF (earlier bytes stay written).
+    const uint32_t* const words = reinterpret_cast<const uint32_t*>(buf);
+    char* cursor = latin1_output;
+    for (size_t index = 0; index < len;) {
+        if (index + 2 <= len) { // fast path: load two code points as one 64-bit value and test both
+            uint64_t pair;
+            ::memcpy(&pair, words + index, sizeof(pair));
+            if ((pair & 0xFFFFFF00FFFFFF00) == 0) {
+                *cursor++ = char(buf[index]);
+                *cursor++ = char(buf[index + 1]);
+                index += 2;
+                continue;
+            }
+        }
+        const uint32_t code_point = words[index];
+        if ((code_point & 0xFFFFFF00) != 0) {
+            return result(error_code::TOO_LARGE, index);
+        }
+        *cursor++ = static_cast<char>(code_point & 0xFF);
+        index++;
+    }
+    return result(error_code::SUCCESS, cursor - latin1_output);
+}
+
+} // utf32_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
+
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf8_to_latin1/valid_utf8_to_latin1.h
+/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert_valid(const char* buf, size_t len, char* latin_output)
+{
+    const uint8_t* data = reinterpret_cast<const uint8_t*>(buf);
+    // Transcodes UTF-8 to Latin-1 with only minimal checks; unlike convert(), the decoded code point is not range checked.
+    size_t pos = 0;
+    char* start { latin_output };
+    // Returns the number of Latin-1 bytes written; 0 is returned on the two failure paths marked below.
+    while (pos < len) {
+        // try to convert the next block of 16 ASCII bytes
+        if (pos + 16 <= len) { // if it is safe to read 16 more bytes, check that they are ascii
+            uint64_t v1;
+            ::memcpy(&v1, data + pos, sizeof(uint64_t));
+            uint64_t v2;
+            ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+            uint64_t v { v1 | v2 }; // only the top bit of each byte matters, so the two halves can be OR-ed together
+            if ((v & 0x8080808080808080) == 0) { // no top bit set in any of the 16 bytes: everything is ASCII
+                size_t final_pos = pos + 16;
+                while (pos < final_pos) {
+                    *latin_output++ = char(buf[pos]);
+                    pos++;
+                }
+                continue;
+            }
+        }
+        // the block was not all ASCII (or fewer than 16 bytes remain): handle one sequence
+        // suppose it is not an all ASCII byte sequence
+        uint8_t leading_byte = data[pos]; // leading byte
+        if (leading_byte < 0b10000000) {
+            // a single ASCII byte is copied through unchanged
+            *latin_output++ = char(leading_byte);
+            pos++;
+        } else if ((leading_byte & 0b11100000) == 0b11000000) { // the first three bits are 110:
+            // We have a two-byte UTF-8
+            if (pos + 1 >= len) {
+                break;
+            } // sequence cut off by the end of the input: stop and return what was written so far (not 0)
+            if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+                return 0;
+            } // failure path: the next byte is not a continuation byte (those start with the bits 10)
+            // no range check here: the code point is simply converted to char
+            uint32_t code_point = (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); // drop the 110 and 10 marker bits and join the remaining 5 + 6 payload bits
+            *latin_output++ = char(code_point);
+            pos += 2;
+        } else {
+            // failure path: a continuation byte or the lead byte of a longer sequence; no further diagnosis is done
+            return 0;
+        }
+    }
+    return latin_output - start;
+}
+
+} // utf8_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf16_to_latin1/valid_utf16_to_latin1.h
+/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+template<endianness big_endian>
+inline size_t convert_valid(const char16_t* buf, size_t len, char* latin_output)
+{
+    // Narrows `len` UTF-16 code units from `buf` to Latin-1 with no range
+    // check: every code unit, byte-swapped first when match_system(big_endian)
+    // is false, is converted to a single char. Always returns `len`.
+    const uint16_t* const units = reinterpret_cast<const uint16_t*>(buf);
+
+    for (size_t index = 0; index < len; index++) {
+        // The caller vouches for the input, so nothing is validated here.
+        const uint16_t unit = match_system(big_endian) ? units[index] : utf16::swap_bytes(units[index]);
+        latin_output[index] = char(unit);
+    }
+
+    return len; // one output byte per input code unit
+}
+
+} // utf16_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=scalar/utf32_to_latin1/valid_utf32_to_latin1.h
+/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+inline size_t convert_valid(const char32_t* buf, size_t len, char* latin1_output)
+{
+    // Narrows `len` UTF-32 code points from `buf` to Latin-1 without
+    // validation: a code point above 0xFF is silently reduced to its low
+    // byte. Returns the number of bytes written, which is always `len`.
+    const uint32_t* const words = reinterpret_cast<const uint32_t*>(buf);
+    char* cursor = latin1_output;
+    size_t index = 0;
+
+    while (index < len) {
+        // Fast path: test two code points with a single 64-bit load.
+        if (index + 2 <= len) {
+            uint64_t pair;
+            ::memcpy(&pair, words + index, sizeof(pair));
+            if ((pair & 0xFFFFFF00FFFFFF00) == 0) {
+                *cursor++ = char(buf[index++]);
+                *cursor++ = char(buf[index++]);
+                continue;
+            }
+        }
+        // Slow path: one code point, keeping only its low byte.
+        *cursor++ = static_cast<char>(words[index++] & 0xFF);
+    }
+    return cursor - latin1_output;
+}
+
+} // utf32_to_latin1 namespace
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
 
 SIMDUTF_PUSH_DISABLE_WARNINGS
 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 
-
 #if SIMDUTF_IMPLEMENTATION_ARM64
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/implementation.cpp
 /* begin file src/arm64/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/begin.h
 /* begin file src/simdutf/arm64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "arm64"
 // #define SIMDUTF_IMPLEMENTATION arm64
@@ -11641,14 +13859,16 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
     simd8<uint8_t> bits = input.reduce_or();
     return bits.max_val() < 0b10000000u;
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
     simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
-    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
+    simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
     // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller is using ^ as well.
     // This will work fine because we only have to report errors for cases with 0-1 lead bytes.
@@ -11658,17 +13878,19 @@ simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd
     return is_second_byte ^ is_third_byte ^ is_fourth_byte;
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-    simd8<bool> is_third_byte  = prev2 >= uint8_t(0b11100000u);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
     simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
     return is_third_byte ^ is_fourth_byte;
 }
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_detect_encodings.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_detect_encodings.cpp
 /* begin file src/arm64/arm_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int arm_detect_encodings(const char * buf, size_t len) {
+int arm_detect_encodings(const char* buf, size_t len)
+{
     const char* start = buf;
     const char* end = buf + len;
 
@@ -11683,13 +13905,13 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
     uint32x4_t currentmax = vmovq_n_u32(0x0);
 
-    checker check{};
+    checker check {};
 
-    while(buf + 64 <= end) {
+    while (buf + 64 <= end) {
         uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
-        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3*simd16<uint16_t>::SIZE / sizeof(char16_t));
+        uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2 * simd16<uint16_t>::SIZE / sizeof(char16_t));
+        uint16x8_t fourthin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 3 * simd16<uint16_t>::SIZE / sizeof(char16_t));
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(secondin);
@@ -11721,15 +13943,15 @@ int arm_detect_encodings(const char * buf, size_t len) {
                 is_utf32 = false;
                 // Code from arm_validate_utf16le.cpp
                 // Not efficient, we do not process surrogates_wordmask1
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
+                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint64_t V0 = ~surrogates_wordmask0;
 
-                const auto vH0 = ((in16 & v_fc) ==  v_dc);
+                const auto vH0 = ((in16 & v_fc) == v_dc);
                 const uint64_t H0 = vH0.to_bitmask64();
 
                 const uint64_t L0 = ~H0 & surrogates_wordmask0;
@@ -11756,12 +13978,12 @@ int arm_detect_encodings(const char * buf, size_t len) {
                     const simd8<uint8_t> in_16 = simd16<uint16_t>::pack(t0, t1);
 
                     const uint64_t surrogates_wordmask = ((in_16 & v_f8) == v_d8).to_bitmask64();
-                    if(surrogates_wordmask == 0) {
+                    if (surrogates_wordmask == 0) {
                         input += 16;
                     } else {
                         const uint64_t V = ~surrogates_wordmask;
 
-                        const auto vH = ((in_16 & v_fc) ==  v_dc);
+                        const auto vH = ((in_16 & v_fc) == v_dc);
                         const uint64_t H = vH.to_bitmask64();
 
                         const uint64_t L = ~H & surrogates_wordmask;
@@ -11785,23 +14007,23 @@ int arm_detect_encodings(const char * buf, size_t len) {
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
+                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
 
                     // Must start checking for surrogates
                     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
                     const uint32x4_t offset = vmovq_n_u32(0xffff2000);
                     const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
 
-                    const uint32x4_t in32 =  vreinterpretq_u32_u16(in);
-                    const uint32x4_t secondin32 =  vreinterpretq_u32_u16(secondin);
-                    const uint32x4_t thirdin32 =  vreinterpretq_u32_u16(thirdin);
-                    const uint32x4_t fourthin32 =  vreinterpretq_u32_u16(fourthin);
+                    const uint32x4_t in32 = vreinterpretq_u32_u16(in);
+                    const uint32x4_t secondin32 = vreinterpretq_u32_u16(secondin);
+                    const uint32x4_t thirdin32 = vreinterpretq_u32_u16(thirdin);
+                    const uint32x4_t fourthin32 = vreinterpretq_u32_u16(fourthin);
 
-                    currentmax = vmaxq_u32(in32,currentmax);
-                    currentmax = vmaxq_u32(secondin32,currentmax);
-                    currentmax = vmaxq_u32(thirdin32,currentmax);
-                    currentmax = vmaxq_u32(fourthin32,currentmax);
+                    currentmax = vmaxq_u32(in32, currentmax);
+                    currentmax = vmaxq_u32(secondin32, currentmax);
+                    currentmax = vmaxq_u32(thirdin32, currentmax);
+                    currentmax = vmaxq_u32(fourthin32, currentmax);
 
                     currentoffsetmax = vmaxq_u32(vaddq_u32(in32, offset), currentoffsetmax);
                     currentoffsetmax = vmaxq_u32(vaddq_u32(secondin32, offset), currentoffsetmax);
@@ -11810,13 +14032,13 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
                     while (input + 4 < end32) {
                         const uint32x4_t in_32 = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-                        currentmax = vmaxq_u32(in_32,currentmax);
+                        currentmax = vmaxq_u32(in_32, currentmax);
                         currentoffsetmax = vmaxq_u32(vaddq_u32(in_32, offset), currentoffsetmax);
                         input += 4;
                     }
 
                     uint32x4_t forbidden_words = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(vmaxvq_u32(forbidden_words) != 0) {
+                    if (vmaxvq_u32(forbidden_words) != 0) {
                         is_utf32 = false;
                     }
                 } else {
@@ -11828,10 +14050,10 @@ int arm_detect_encodings(const char * buf, size_t len) {
         // If no surrogate, validate under other encodings as well
 
         // UTF-32 validation
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin),currentmax);
-        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin),currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(in), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(secondin), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(thirdin), currentmax);
+        currentmax = vmaxq_u32(vreinterpretq_u32_u16(fourthin), currentmax);
 
         // UTF-8 validation
         // Relies on ../generic/utf8_validation/utf8_lookup4_algorithm.h
@@ -11845,7 +14067,7 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
+            uint8_t block[64] {};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -11856,14 +14078,14 @@ int arm_detect_encodings(const char * buf, size_t len) {
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
+        if (vmaxvq_u32(is_zero) == 0 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -11872,10 +14094,11 @@ int arm_detect_encodings(const char * buf, size_t len) {
 }
 /* end file src/arm64/arm_detect_encodings.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_validate_utf16.cpp
 /* begin file src/arm64/arm_validate_utf16.cpp */
-template <endianness big_endian>
-const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const char16_t* arm_validate_utf16(const char16_t* input, size_t size)
+{
     const char16_t* end = input + size;
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
@@ -11888,11 +14111,11 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
         auto in0 = simd16<uint16_t>(input);
         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
         if (!match_system(big_endian)) {
-            #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            #else
-            const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-            #endif
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
         }
@@ -11901,7 +14124,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-        if(surrogates_wordmask == 0) {
+        if (surrogates_wordmask == 0) {
             input += 16;
         } else {
             // 2. We have some surrogates that have to be distinguished:
@@ -11915,7 +14138,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
             const uint64_t V = ~surrogates_wordmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = ((in & v_fc) ==  v_dc);
+            const auto vH = ((in & v_fc) == v_dc);
             const uint64_t H = vH.to_bitmask64();
 
             // L - word mask for low surrogates
@@ -11923,11 +14146,11 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
             const uint64_t L = ~H & surrogates_wordmask;
 
             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
-                              // (A low surrogate placed in the 7th register's word
-                              // is an exception we handle.)
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
-                          // thanks to that we have only two masks for valid case.
-            const uint64_t c = V | a | b;      // Combine all the masks into the final one.
+                                       // thanks to that we have only two masks for valid case.
+            const uint64_t c = V | a | b; // Combine all the masks into the final one.
             if (c == ~0ull) {
                 // The whole input register contains valid UTF-16, i.e.,
                 // either single words or proper surrogate pairs.
@@ -11946,9 +14169,9 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
     return input;
 }
 
-
-template <endianness big_endian>
-const result arm_validate_utf16_with_errors(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
+{
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -11964,11 +14187,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
         auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
 
         if (!match_system(big_endian)) {
-            #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
             const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-            #else
-            const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-            #endif
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
             in0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in0), swap));
             in1 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in1), swap));
         }
@@ -11977,7 +14200,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
         const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
         // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
         const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-        if(surrogates_wordmask == 0) {
+        if (surrogates_wordmask == 0) {
             input += 16;
         } else {
             // 2. We have some surrogates that have to be distinguished:
@@ -11991,7 +14214,7 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint64_t V = ~surrogates_wordmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto vH = ((in & v_fc) ==  v_dc);
+            const auto vH = ((in & v_fc) == v_dc);
             const uint64_t H = vH.to_bitmask64();
 
             // L - word mask for low surrogates
@@ -11999,11 +14222,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint64_t L = ~H & surrogates_wordmask;
 
             const uint64_t a = L & (H >> 4); // A low surrogate must be followed by high one.
-                              // (A low surrogate placed in the 7th register's word
-                              // is an exception we handle.)
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
             const uint64_t b = a << 4; // Just mark that the opposite fact is hold,
-                          // thanks to that we have only two masks for valid case.
-            const uint64_t c = V | a | b;      // Combine all the masks into the final one.
+                                       // thanks to that we have only two masks for valid case.
+            const uint64_t c = V | a | b; // Combine all the masks into the final one.
             if (c == ~0ull) {
                 // The whole input register contains valid UTF-16, i.e.,
                 // either single words or proper surrogate pairs.
@@ -12022,10 +14245,11 @@ const result arm_validate_utf16_with_errors(const char16_t* input, size_t size)
     return result(error_code::SUCCESS, input - start);
 }
 /* end file src/arm64/arm_validate_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_validate_utf32le.cpp
 /* begin file src/arm64/arm_validate_utf32le.cpp */
 
-const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
+const char32_t* arm_validate_utf32le(const char32_t* input, size_t size)
+{
     const char32_t* end = input + size;
 
     const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
@@ -12036,26 +14260,26 @@ const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
 
     while (input + 4 < end) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-        currentmax = vmaxq_u32(in,currentmax);
+        currentmax = vmaxq_u32(in, currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
         input += 4;
     }
 
     uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-    if(vmaxvq_u32(is_zero) != 0) {
+    if (vmaxvq_u32(is_zero) != 0) {
         return nullptr;
     }
 
     is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if(vmaxvq_u32(is_zero) != 0) {
+    if (vmaxvq_u32(is_zero) != 0) {
         return nullptr;
     }
 
     return input;
 }
 
-
-const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size)
+{
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -12067,16 +14291,16 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
 
     while (input + 4 < end) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
-        currentmax = vmaxq_u32(in,currentmax);
+        currentmax = vmaxq_u32(in, currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
 
         uint32x4_t is_zero = veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-        if(vmaxvq_u32(is_zero) != 0) {
+        if (vmaxvq_u32(is_zero) != 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if(vmaxvq_u32(is_zero) != 0) {
+        if (vmaxvq_u32(is_zero) != 0) {
             return result(error_code::SURROGATE, input - start);
         }
 
@@ -12087,316 +14311,308 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
 }
 /* end file src/arm64/arm_validate_utf32le.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf8_to_utf16.cpp
 /* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  #else
-  const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-  #endif
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-    // We process in chunks of 16 bytes
-    uint16x8_t ascii_first = vmovl_u8(vget_low_u8 (in));
-    uint16x8_t ascii_second = vmovl_high_u8(in);
-    if (!match_system(big_endian)) {
-      ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
-      ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
-    }
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    uint8x16_t perm = vqtbl1q_u8(in, swap);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
-    vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
+template<endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char16_t*& utf16_output)
+{
+// we use an approach where we try to process up to 12 input bytes.
+// Why 12 input bytes and not 16? Because we are concerned with the size of
+// the lookup tables. Also 12 is nicely divisible by two and three.
+//
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+    const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-    const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
+    const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
-    if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
-    utf16_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-
-
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    if (!match_system(big_endian)) composed = vqtbl1q_u8(composed, swap);
-    vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
-    if (!match_system(big_endian)) composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
-    vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
-    utf16_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
-    uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
-    // correct for spurious high bit
-    uint8x16_t correct =
-        vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
-    middlehighbyte = veorq_u8(correct, middlehighbyte);
-    uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
-    uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
-    uint8x16_t composed =
-        vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
-                     vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
-    uint32x4_t composedminus =
-        vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
-    uint32x4_t lowtenbits =
-        vandq_u32(composedminus, vmovq_n_u32(0x3ff));
-    uint32x4_t hightenbits = vshrq_n_u32(composedminus, 10);
-    uint32x4_t lowtenbitsadd =
-        vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
-    uint32x4_t hightenbitsadd =
-        vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
-    uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
-    uint32x4_t surrogates =
-        vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (!match_system(big_endian)) {
-      vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
-      surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
-    }
-    vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
-    uint32_t surrogate_buffer[4];
-    vst1q_u32(surrogate_buffer, surrogates);
-    for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] < 65536) {
-        utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-        utf16_output++;
-      } else {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-        utf16_output += 2;
-      }
-    }
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+    uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+        // We process in chunks of 16 bytes
+        uint16x8_t ascii_first = vmovl_u8(vget_low_u8(in));
+        uint16x8_t ascii_second = vmovl_high_u8(in);
+        if (!match_system(big_endian)) {
+            ascii_first = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_first), swap));
+            ascii_second = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(ascii_second), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), ascii_first);
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output) + 8, ascii_second);
+        utf16_output += 16; // We wrote 16 16-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        uint8x16_t perm = vqtbl1q_u8(in, swap);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        if (!match_system(big_endian)) {
+            composed = vqtbl1q_u8(composed, swap);
+        }
+        vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
+        utf16_output += 8; // We wrote 16 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+#else
+        const uint8x16_t sh = { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 };
+#endif
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
+        if (!match_system(big_endian)) {
+            composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
+        utf16_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back.
+
+    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes.
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        if (!match_system(big_endian)) {
+            composed = vqtbl1q_u8(composed, swap);
+        }
+        vst1q_u8(reinterpret_cast<uint8_t*>(utf16_output), composed);
+        utf16_output += 6; // We wrote 12 bytes, 6 code points.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        uint16x8_t composed_repacked = vmovn_high_u32(vmovn_u32(composed), composed);
+        if (!match_system(big_endian)) {
+            composed_repacked = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(composed_repacked), swap));
+        }
+        vst1q_u16(reinterpret_cast<uint16_t*>(utf16_output), composed_repacked);
+        utf16_output += 4;
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        //////////////
+        // There might be garbage inputs where a leading byte masquerades as a four-byte
+        // leading byte (by being followed by 3 continuation bytes), but is smaller than
+        // 0xf0. This could trigger a buffer overflow if we only counted leading
+        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+        // We do so at the cost of an extra mask.
+        /////////////
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
+        // correct for spurious high bit
+        uint8x16_t correct = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
+        middlehighbyte = veorq_u8(correct, middlehighbyte);
+        uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
+        // We deliberately carry the leading four bits if they are present, we remove
+        // them later when computing hightenbits.
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0xff000000)));
+        uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
+        // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+        // the corresponding 32-bit value in 'composed' will be greater than
+        // (0xf0000000 >> 6), i.e. > 0x3c00000. This can be used later to identify the
+        // location of the surrogate pairs.
+        uint8x16_t composed = vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
+            vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
+        uint32x4_t composedminus = vsubq_u32(vreinterpretq_u32_u8(composed), vmovq_n_u32(0x10000));
+        uint32x4_t lowtenbits = vandq_u32(composedminus, vmovq_n_u32(0x3ff));
+        // Notice the 0x3ff mask:
+        uint32x4_t hightenbits = vandq_u32(vshrq_n_u32(composedminus, 10), vmovq_n_u32(0x3ff));
+        uint32x4_t lowtenbitsadd = vaddq_u32(lowtenbits, vmovq_n_u32(0xDC00));
+        uint32x4_t hightenbitsadd = vaddq_u32(hightenbits, vmovq_n_u32(0xD800));
+        uint32x4_t lowtenbitsaddshifted = vshlq_n_u32(lowtenbitsadd, 16);
+        uint32x4_t surrogates = vorrq_u32(hightenbitsadd, lowtenbitsaddshifted);
+        uint32_t basic_buffer[4];
+        uint32_t basic_buffer_swap[4];
+        if (!match_system(big_endian)) {
+            vst1q_u32(basic_buffer_swap, vreinterpretq_u32_u8(vqtbl1q_u8(composed, swap)));
+            surrogates = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(surrogates), swap));
+        }
+        vst1q_u32(basic_buffer, vreinterpretq_u32_u8(composed));
+        uint32_t surrogate_buffer[4];
+        vst1q_u32(surrogate_buffer, surrogates);
+        for (size_t i = 0; i < 3; i++) {
+            if (basic_buffer[i] > 0x3c00000) {
+                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+                utf16_output += 2;
+            } else {
+                utf16_output[0] = !match_system(big_endian) ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+                utf16_output++;
+            }
+        }
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf8_to_utf32.cpp
 /* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char32_t *&utf32_out) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xFFF;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
-    // We process in chunks of 16 bytes
-    vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8 (in)))));
-    vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8 (in))));
-    vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
-    vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
-    utf32_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
+size_t convert_masked_utf8_to_utf32(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char32_t*& utf32_out)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    uint32_t*& utf32_output = reinterpret_cast<uint32_t*&>(utf32_out);
+    uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t*>(input));
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xFFF;
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it may be
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+        // We process in chunks of 16 bytes
+        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(in)))));
+        vst1q_u32(utf32_output + 4, vmovl_high_u16(vmovl_u8(vget_low_u8(in))));
+        vst1q_u32(utf32_output + 8, vmovl_u16(vget_low_u16(vmovl_high_u8(in))));
+        vst1q_u32(utf32_output + 12, vmovl_high_u16(vmovl_high_u8(in)));
+        utf32_output += 16; // We wrote 16 32-bit code points (64 bytes).
+        return 16; // We consumed 16 bytes.
+    }
+    if ((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const uint8x16_t sh = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-    //const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-    const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        // const uint8x16_t sh = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        const uint8x16_t sh = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
-    vst1q_u32(utf32_output+4,  vmovl_high_u16(vreinterpretq_u16_u8(composed)));
-    utf32_output += 8; // We wrote 32 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
+        vst1q_u32(utf32_output + 4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
+        utf32_output += 8; // We wrote 32 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
+        const uint8x16_t sh = make_uint8x16_t(2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255);
 #else
-    const uint8x16_t sh = {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255};
+        const uint8x16_t sh = { 2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255 };
 #endif
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    vst1q_u32(utf32_output, composed);
-    utf32_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-
-
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
-    uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
-    vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
-    vst1q_u32(utf32_output+4,  vmovl_high_u16(vreinterpretq_u16_u8(composed)));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
-    uint8x16_t middlebyte =
-        vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint32x4_t highbyte =
-        vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
-    uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
-    uint32x4_t composed =
-        vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
-    vst1q_u32(utf32_output, composed);
-    utf32_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    uint8x16_t perm = vqtbl1q_u8(in, sh);
-    uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
-    uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
-    uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
-    uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
-    // correct for spurious high bit
-    uint8x16_t correct =
-        vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
-    middlehighbyte = veorq_u8(correct, middlehighbyte);
-    uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
-    uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
-    uint8x16_t highbyte_shifted =vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
-    uint8x16_t composed =
-        vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
-                     vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
-    vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
-    utf32_output += 3;
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        vst1q_u32(utf32_output, composed);
+        utf32_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back to the table-driven path.
+
+    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes.
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x7f)));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u16(vmovq_n_u16(0x1f00)));
+        uint8x16_t composed = vorrq_u8(ascii, vreinterpretq_u8_u16(vshrq_n_u16(vreinterpretq_u16_u8(highbyte), 2)));
+        vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(vreinterpretq_u16_u8(composed))));
+        vst1q_u32(utf32_output + 4, vmovl_high_u16(vreinterpretq_u16_u8(composed)));
+        utf32_output += 6; // We keep 6 code points (24 bytes); the two stores above wrote 8 lanes (32 bytes).
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f))); // 7 or 6 bits
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00))); // 5 or 6 bits
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint32x4_t highbyte = vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x0f0000)))); // 4 bits
+        uint32x4_t highbyte_shifted = vshrq_n_u32(highbyte, 4);
+        uint32x4_t composed = vorrq_u32(vorrq_u32(vreinterpretq_u32_u8(ascii), vreinterpretq_u32_u8(middlebyte_shifted)), highbyte_shifted);
+        vst1q_u32(utf32_output, composed);
+        utf32_output += 4;
+    } else if (idx < 209) {
+        // THREE (3) input code-words (the output pointer advances by 3 at the end of this branch)
+        uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t*>(simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        uint8x16_t perm = vqtbl1q_u8(in, sh);
+        uint8x16_t ascii = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x7f)));
+        uint8x16_t middlebyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f00)));
+        uint8x16_t middlebyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlebyte), 2));
+        uint8x16_t middlehighbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x3f0000)));
+        // correct for spurious high bit
+        uint8x16_t correct = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x400000)))), 1));
+        middlehighbyte = veorq_u8(correct, middlehighbyte);
+        uint8x16_t middlehighbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(middlehighbyte), 4));
+        uint8x16_t highbyte = vandq_u8(perm, vreinterpretq_u8_u32(vmovq_n_u32(0x07000000)));
+        uint8x16_t highbyte_shifted = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(highbyte), 6));
+        uint8x16_t composed = vorrq_u8(vorrq_u8(ascii, middlebyte_shifted),
+            vorrq_u8(highbyte_shifted, middlehighbyte_shifted));
+        vst1q_u32(utf32_output, vreinterpretq_u32_u8(composed));
+        utf32_output += 3;
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf16_to_utf8.cpp
 /* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -12450,533 +14666,540 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-        // It is common enough that we have sequences of 16 consecutive ASCII characters.
-        uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+template<endianness big_endian>
+std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    while (buf + 16 <= end) {
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         if (!match_system(big_endian)) {
-          #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          #else
-          const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-          #endif
-          nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
-        }
-        if(vmaxvq_u16(nextin) > 0x7F) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(in);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-          // 2. store (16 bytes)
-          vst1q_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    if (vmaxvq_u16(in) <= 0x7FF) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-          const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const uint16x8_t t0 = vshlq_n_u16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const uint16x8_t t2 = vandq_u16(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const uint16x8_t t3 = vorrq_u16(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-          // 2. merge ASCII and 2-byte codewords
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-          const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-          // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080);
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-          const uint16x8_t mask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080 };
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-          uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const uint8x16_t shuffle = vld1q_u8(row + 1);
-          const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          vst1q_u8(utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+        if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+            // It is common enough that we have sequences of 16 consecutive ASCII characters.
+            uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 8);
+            if (!match_system(big_endian)) {
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+                const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
 #else
-        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+                const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
 #endif
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-        const uint16x8_t s0 = vshrq_n_u16(in, 12);
-        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-        const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000));
-        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-        // [00bb|bbbb|0000|aaaa]
-        const uint16x8_t s2 = vorrq_u16(s0, s1s);
-        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-        const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-        const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-        const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+                nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
+            }
+            if (vmaxvq_u16(nextin) > 0x7F) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(in);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+                // 2. store (16 bytes)
+                vst1q_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        if (vmaxvq_u16(in) <= 0x7FF) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const uint16x8_t t0 = vshlq_n_u16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const uint16x8_t t2 = vandq_u16(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const uint16x8_t t3 = vorrq_u16(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+            // 2. merge ASCII and 2-byte codewords
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+            // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 );
-        const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 );
+            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080);
 #else
-        const uint16x8_t onemask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 };
-        const uint16x8_t twomask = { 0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 };
+            const uint16x8_t mask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080 };
 #endif
-        const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-        const uint16_t mask = vaddvq_u16(combined);
-        // The following fast path may or may not be beneficial.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += 12;
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += row0[0];
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const uint8x16_t shuffle = vld1q_u8(row + 1);
+            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            vst1q_u8(utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+            const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+#endif
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally, from these two words we build the proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+            // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+            const uint16x8_t s0 = vshrq_n_u16(in, 12);
+            // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+            const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+            // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+            const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+            // [00bb|bbbb|0000|aaaa]
+            const uint16x8_t s2 = vorrq_u16(s0, s1s);
+            // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+            const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+            const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+            const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+            const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+            const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000);
+            const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000);
+#else
+            const uint16x8_t onemask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000 };
+            const uint16x8_t twomask = { 0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000 };
+#endif
+            const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+            const uint16_t mask = vaddvq_u16(combined);
+            // The following fast path may or may not be beneficial.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+              const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+              const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+              vst1q_u8(utf8_output, utf8_0);
+              utf8_output += 12;
+              vst1q_u8(utf8_output, utf8_1);
+              utf8_output += 12;
+              buf += 8;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+            vst1q_u8(utf8_output, utf8_0);
+            utf8_output += row0[0];
+            vst1q_u8(utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+    return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+template<endianness big_endian>
+std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
     const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-    if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-        // It is common enough that we have sequences of 16 consecutive ASCII characters.
-        uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    while (buf + 16 <= end) {
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         if (!match_system(big_endian)) {
-          #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          #else
-          const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-          #endif
-          nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
-        }
-        if(vmaxvq_u16(nextin) > 0x7F) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(in);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-          // 2. store (16 bytes)
-          vst1q_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    if (vmaxvq_u16(in) <= 0x7FF) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-          const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const uint16x8_t t0 = vshlq_n_u16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const uint16x8_t t2 = vandq_u16(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const uint16x8_t t3 = vorrq_u16(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-          // 2. merge ASCII and 2-byte codewords
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-          const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-          // 3. prepare bitmask for 8-bit lookup
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080);
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+        if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+            // It is common enough that we have sequences of 16 consecutive ASCII characters.
+            uint16x8_t nextin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 8);
+            if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+                const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+                nextin = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(nextin), swap));
+            }
+            if (vmaxvq_u16(nextin) > 0x7F) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(in);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+                // 2. store (16 bytes)
+                vst1q_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        if (vmaxvq_u16(in) <= 0x7FF) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const uint16x8_t t0 = vshlq_n_u16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const uint16x8_t t2 = vandq_u16(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const uint16x8_t t3 = vorrq_u16(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+            // 2. merge ASCII and 2-byte codewords
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+            // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080);
 #else
-          const uint16x8_t mask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0002, 0x0008,
-                                    0x0020, 0x0080 };
+            const uint16x8_t mask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0002, 0x0008,
+                0x0020, 0x0080 };
 #endif
-          uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const uint8x16_t shuffle = vld1q_u8(row + 1);
-          const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          vst1q_u8(utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const uint8x16_t shuffle = vld1q_u8(row + 1);
+            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            vst1q_u8(utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+            const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 #else
-        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+            const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
 #endif
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-        const uint16x8_t s0 = vshrq_n_u16(in, 12);
-        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-        const uint16x8_t s1 = vandq_u16(in, vec(0b0000111111000000));
-        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-        // [00bb|bbbb|0000|aaaa]
-        const uint16x8_t s2 = vorrq_u16(s0, s1s);
-        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-        const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-        const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-        const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally, from these two words we build the proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+            // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+            const uint16x8_t s0 = vshrq_n_u16(in, 12);
+            // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+            const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+            // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+            const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+            // [00bb|bbbb|0000|aaaa]
+            const uint16x8_t s2 = vorrq_u16(s0, s1s);
+            // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+            const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+            const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+            const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+            const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+            const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+            const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
 #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 );
-        const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 );
+            const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000);
+            const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000);
 #else
-        const uint16x8_t onemask = { 0x0001, 0x0004,
-                                    0x0010, 0x0040,
-                                    0x0100, 0x0400,
-                                    0x1000, 0x4000 };
-        const uint16x8_t twomask = { 0x0002, 0x0008,
-                                    0x0020, 0x0080,
-                                    0x0200, 0x0800,
-                                    0x2000, 0x8000 };
+            const uint16x8_t onemask = { 0x0001, 0x0004,
+                0x0010, 0x0040,
+                0x0100, 0x0400,
+                0x1000, 0x4000 };
+            const uint16x8_t twomask = { 0x0002, 0x0008,
+                0x0020, 0x0080,
+                0x0200, 0x0800,
+                0x2000, 0x8000 };
 #endif
-        const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-        const uint16_t mask = vaddvq_u16(combined);
-        // The following fast path may or may not be beneficial.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += 12;
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += row0[0];
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+            const uint16_t mask = vaddvq_u16(combined);
+            // The following fast path may or may not be beneficial.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+              const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+              const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+              vst1q_u8(utf8_output, utf8_0);
+              utf8_output += 12;
+              vst1q_u8(utf8_output, utf8_1);
+              utf8_output += 12;
+              buf += 8;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+            vst1q_u8(utf8_output, utf8_0);
+            utf8_output += row0[0];
+            vst1q_u8(utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char*>(utf8_output));
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
 }
 /* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf16_to_utf32.cpp
 /* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -13030,736 +15253,770 @@ std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* b
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out) {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: no surrogate pairs, extend all 16-bit words to 32-bit words
-      vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
-      vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
-      utf32_output += 8;
-      buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian> // big_endian presumably names the input byte order; lanes are byte-swapped when !match_system(big_endian)
+std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_out)
+{ // NEON UTF-16 -> UTF-32: returns {first unconsumed input position, end of written output}, or {nullptr, output end} on a bad surrogate pair
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out); // written through uint32_t* so vst1q_u32 can store directly
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); // mask for the surrogate test (w & 0xF800) == 0xD800
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); // pattern for the surrogate test
+
+    while (buf + 16 <= end) { // 16 units of headroom: the scalar fallback below may read up to buf[15]
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
+        if (!match_system(big_endian)) { // swap the two bytes of every 16-bit lane
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); // lane set where the unit lies in 0xD800..0xDFFF
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: no surrogates among these 8 units; widen each 16-bit word to a 32-bit word
+            vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+            vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+            utf32_output += 8;
+            buf += 8;
+            // otherwise (branch below): at least one surrogate code unit in the register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15; // scan indices 0..14; a pair starting at index 14 also reads buf[15]
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // cannot trigger here: the loop condition guarantees end - buf >= 16
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word); // not a surrogate: the code unit is the code point
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++; // the second unit of the pair is consumed together with the first
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) { // valid only if word is in 0xD800..0xDBFF and next_word in 0xDC00..0xDFFF
+                        return std::make_pair(nullptr, reinterpret_cast<char32_t*>(utf32_output)); // nullptr input position reports the invalid pair
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k; // k is the number of code units consumed above (15, or 16 if a pair ended the scan)
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output));
+    } // while
+    return std::make_pair(buf, reinterpret_cast<char32_t*>(utf32_output)); // fewer than 16 units remain at buf; the caller handles the tail
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out) {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t*>(utf32_out);
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      #else
-      const uint8x16_t swap = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
-      #endif
-      in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
-    }
-
-    const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-      if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: no surrogate pairs, extend all 16-bit words to 32-bit words
-      vst1q_u32(utf32_output,  vmovl_u16(vget_low_u16(in)));
-      vst1q_u32(utf32_output+4,  vmovl_high_u16(in));
-      utf32_output += 8;
-      buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian> // big_endian presumably names the input byte order; lanes are byte-swapped when !match_system(big_endian)
+std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_out)
+{ // NEON UTF-16 -> UTF-32 that reports where an invalid surrogate pair occurs; returns {result, end of written output}
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out); // written through uint32_t* so vst1q_u32 can store directly
+    const char16_t* start = buf; // kept so positions can be reported as offsets from the start of the input
+    const char16_t* end = buf + len;
+
+    const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); // mask for the surrogate test (w & 0xF800) == 0xD800
+    const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); // pattern for the surrogate test
+
+    while (buf + 16 <= end) { // 16 units of headroom: the scalar fallback below may read up to buf[15]
+        uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
+        if (!match_system(big_endian)) { // swap the two bytes of every 16-bit lane
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+            const uint8x16_t swap = make_uint8x16_t(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+#else
+            const uint8x16_t swap = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+#endif
+            in = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(in), swap));
+        }
+
+        const uint16x8_t surrogates_bytemask = vceqq_u16(vandq_u16(in, v_f800), v_d800); // lane set where the unit lies in 0xD800..0xDFFF
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (vmaxvq_u16(surrogates_bytemask) == 0) {
+            // case: no surrogates among these 8 units; widen each 16-bit word to a 32-bit word
+            vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+            vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+            utf32_output += 8;
+            buf += 8;
+            // otherwise (branch below): at least one surrogate code unit in the register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15; // scan indices 0..14; a pair starting at index 14 also reads buf[15]
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // cannot trigger here: the loop condition guarantees end - buf >= 16
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word); // not a surrogate: the code unit is the code point
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++; // the second unit of the pair is consumed together with the first
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) { // valid only if word is in 0xD800..0xDBFF and next_word in 0xDC00..0xDFFF
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), reinterpret_cast<char32_t*>(utf32_output)); // k already advanced, so k - 1 indexes the first unit of the bad pair
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k; // k is the number of code units consumed above (15, or 16 if a pair ended the scan)
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output));
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char32_t*>(utf32_output)); // count = code units consumed so far; fewer than 16 remain for the caller
 }
 /* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf32_to_utf8.cpp
 /* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-  const char32_t* end = buf + len;
-
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
-
-  while (buf + 16 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-      if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          continue; // we are done for this round!
-      }
-
-      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+std::pair<const char32_t*, char*> arm_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+    const char32_t* end = buf + len;
+
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
+
+    while (buf + 16 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+        uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t*>(buf + 4));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+            // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+            // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+            uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+            if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                continue; // we are done for this round!
+            }
 
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const uint16x8_t t3 = vorrq_u16(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-            // 2. merge ASCII and 2-byte codewords
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-            // 3. prepare bitmask for 8-bit lookup
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080);
-  #else
-            const uint16x8_t mask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080 };
-  #endif
-            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const uint8x16_t shuffle = vld1q_u8(row + 1);
-            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+            if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+                // 1. prepare 2-byte values
+                // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+                // expected output   : [110a|aaaa|10bb|bbbb] x 8
+                const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+                const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+                // t0 = [000a|aaaa|bbbb|bb00]
+                const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+                // t1 = [000a|aaaa|0000|0000]
+                const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+                // t2 = [0000|0000|00bb|bbbb]
+                const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+                // t3 = [000a|aaaa|00bb|bbbb]
+                const uint16x8_t t3 = vorrq_u16(t1, t2);
+                // t4 = [110a|aaaa|10bb|bbbb]
+                const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+                // 2. merge ASCII and 2-byte codewords
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+                const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+                // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080);
+#else
+                const uint16x8_t mask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080 };
+#endif
+                uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+                // 4. pack the bytes
+                const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+                const uint8x16_t shuffle = vld1q_u8(row + 1);
+                const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
 
-            // 5. store bytes
-            vst1q_u8(utf8_output, utf8_packed);
+                // 5. store bytes
+                vst1q_u8(utf8_output, utf8_packed);
 
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
+                // 6. adjust pointers
+                buf += 8;
+                utf8_output += row[0];
+                continue;
 
-      } else {
-        // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-        forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
-
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-  #else
-          const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-  #endif
-          /* In this branch we handle three cases:
-            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-            We expand the input word (16-bit) into two words (32-bit), thus
-            we have room for four bytes. However, we need five distinct bit
-            layouts. Note that the last byte in cases #2 and #3 is the same.
-
-            We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-            in register t2.
-
-            We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-            either byte 1 for case #2 or byte 2 for case #3. Note that they
-            differ by exactly one bit.
-
-            Finally from these two words we build proper UTF-8 sequence, taking
-            into account the case (i.e, the number of bytes to write).
-          */
-          /**
-           * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-           * t2 => [0ccc|cccc] [10cc|cccc]
-           * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-           */
-  #define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-          // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-          const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
-          // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-          const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-          // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-          const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-          // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-          const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-          // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-          const uint16x8_t s1 = vandq_u16(utf16_packed, vec(0b0000111111000000));
-          // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-          const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-          // [00bb|bbbb|0000|aaaa]
-          const uint16x8_t s2 = vorrq_u16(s0, s1s);
-          // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-          const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-          const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-          const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
-          const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-          const uint16x8_t s4 = veorq_u16(s3, m0);
-  #undef vec
-
-          // 4. expand words 16-bit => 32-bit
-          const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-          const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-          // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 );
-          const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 );
-  #else
-          const uint16x8_t onemask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 };
-          const uint16x8_t twomask = { 0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 };
-  #endif
-          const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-          const uint16_t mask = vaddvq_u16(combined);
-          // The following fast path may or may not be beneficial.
-          /*if(mask == 0) {
-            // We only have three-byte words. Use fast path.
-            const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-            vst1q_u8(utf8_output, utf8_0);
-            utf8_output += 12;
-            vst1q_u8(utf8_output, utf8_1);
-            utf8_output += 12;
-            buf += 8;
-            continue;
-          }*/
-          const uint8_t mask0 = uint8_t(mask);
-
-          const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-          const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-          const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-          const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-          const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += row0[0];
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += row1[0];
-
-          buf += 8;
-      }
-    // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000)==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            } else {
+                // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+                const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+                const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+                forbidden_bytemask = vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)), forbidden_bytemask);
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+                const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+#endif
+                /* In this branch we handle three cases:
+                  1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+                  We expand the input word (16-bit) into two words (32-bit), thus
+                  we have room for four bytes. However, we need five distinct bit
+                  layouts. Note that the last byte in cases #2 and #3 is the same.
+
+                  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+                  in register t2.
+
+                  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+                  either byte 1 for case #2 or byte 2 for case #3. Note that they
+                  differ by exactly one bit.
+
+                  Finally from these two words we build proper UTF-8 sequence, taking
+                  into account the case (i.e., the number of bytes to write).
+                */
+                /**
+                 * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+                 * t2 => [0ccc|cccc] [10cc|cccc]
+                 * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+                 */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+                // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+                const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
+                // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+                const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+                // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+                const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+                // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+                const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+                // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+                const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+                // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+                const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+                // [00bb|bbbb|0000|aaaa]
+                const uint16x8_t s2 = vorrq_u16(s0, s1s);
+                // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+                const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+                const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+                const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
+                const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+                const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+                // 4. expand words 16-bit => 32-bit
+                const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+                const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+                // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000);
+                const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000);
+#else
+                const uint16x8_t onemask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000 };
+                const uint16x8_t twomask = { 0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000 };
+#endif
+                const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+                const uint16_t mask = vaddvq_u16(combined);
+                // The following fast path may or may not be beneficial.
+                /*if(mask == 0) {
+                  // We only have three-byte words. Use fast path.
+                  const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+                  const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+                  const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+                  vst1q_u8(utf8_output, utf8_0);
+                  utf8_output += 12;
+                  vst1q_u8(utf8_output, utf8_1);
+                  utf8_output += 12;
+                  buf += 8;
+                  continue;
+                }*/
+                const uint8_t mask0 = uint8_t(mask);
+                const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+                const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+                const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+                const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+                const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+                const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+                const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+                vst1q_u8(utf8_output, utf8_0);
+                utf8_output += row0[0];
+                vst1q_u8(utf8_output, utf8_1);
+                utf8_output += row1[0];
+
+                buf += 8;
+            }
+            // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
         } else {
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  if (vmaxvq_u16(forbidden_bytemask) != 0) {
-    return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
-  }
-  return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
-}
-
-
-std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out) {
-  uint8_t * utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
-
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-
-  while (buf + 16 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-      if(vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-          // 1. pack the bytes
-          // obviously suboptimal.
-          uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-          // 2. store (8 bytes)
-          vst1_u8(utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          continue; // we are done for this round!
-      }
-
-      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-            // 1. prepare 2-byte values
-            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-            // expected output   : [110a|aaaa|10bb|bbbb] x 8
-            const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-            const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    // check for invalid input
+    if (vmaxvq_u16(forbidden_bytemask) != 0) {
+        return std::make_pair(nullptr, reinterpret_cast<char*>(utf8_output));
+    }
+    return std::make_pair(buf, reinterpret_cast<char*>(utf8_output));
+}
+
+std::pair<result, char*> arm_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_out)
+{
+    uint8_t* utf8_output = reinterpret_cast<uint8_t*>(utf8_out);
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
+
+    const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+    while (buf + 16 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+        uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t*>(buf + 4));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+            // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+            // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+            uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+            if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+                // 1. pack the bytes
+                // obviously suboptimal.
+                uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+                // 2. store (8 bytes)
+                vst1_u8(utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                continue; // we are done for this round!
+            }
 
-            // t0 = [000a|aaaa|bbbb|bb00]
-            const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-            // t1 = [000a|aaaa|0000|0000]
-            const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-            // t2 = [0000|0000|00bb|bbbb]
-            const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-            // t3 = [000a|aaaa|00bb|bbbb]
-            const uint16x8_t t3 = vorrq_u16(t1, t2);
-            // t4 = [110a|aaaa|10bb|bbbb]
-            const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-            // 2. merge ASCII and 2-byte codewords
-            const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-            const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-            const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-            // 3. prepare bitmask for 8-bit lookup
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-            const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080);
-  #else
-            const uint16x8_t mask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0002, 0x0008,
-                                      0x0020, 0x0080 };
-  #endif
-            uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-            // 4. pack the bytes
-            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-            const uint8x16_t shuffle = vld1q_u8(row + 1);
-            const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+            if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+                // 1. prepare 2-byte values
+                // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+                // expected output   : [110a|aaaa|10bb|bbbb] x 8
+                const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+                const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+                // t0 = [000a|aaaa|bbbb|bb00]
+                const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+                // t1 = [000a|aaaa|0000|0000]
+                const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+                // t2 = [0000|0000|00bb|bbbb]
+                const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+                // t3 = [000a|aaaa|00bb|bbbb]
+                const uint16x8_t t3 = vorrq_u16(t1, t2);
+                // t4 = [110a|aaaa|10bb|bbbb]
+                const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+                // 2. merge ASCII and 2-byte codewords
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+                const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+                // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t mask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080);
+#else
+                const uint16x8_t mask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0002, 0x0008,
+                    0x0020, 0x0080 };
+#endif
+                uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+                // 4. pack the bytes
+                const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+                const uint8x16_t shuffle = vld1q_u8(row + 1);
+                const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
 
-            // 5. store bytes
-            vst1q_u8(utf8_output, utf8_packed);
+                // 5. store bytes
+                vst1q_u8(utf8_output, utf8_packed);
 
-            // 6. adjust pointers
-            buf += 8;
-            utf8_output += row[0];
-            continue;
+                // 6. adjust pointers
+                buf += 8;
+                utf8_output += row[0];
+                continue;
 
-      } else {
-        // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-        // check for invalid input
-        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-        const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
-        if (vmaxvq_u16(forbidden_bytemask) != 0) {
-          return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
-        }
-
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-  #else
-          const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                      0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-  #endif
-          /* In this branch we handle three cases:
-            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-            We expand the input word (16-bit) into two words (32-bit), thus
-            we have room for four bytes. However, we need five distinct bit
-            layouts. Note that the last byte in cases #2 and #3 is the same.
-
-            We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-            in register t2.
-
-            We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-            either byte 1 for case #2 or byte 2 for case #3. Note that they
-            differ by exactly one bit.
-
-            Finally from these two words we build proper UTF-8 sequence, taking
-            into account the case (i.e, the number of bytes to write).
-          */
-          /**
-           * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-           * t2 => [0ccc|cccc] [10cc|cccc]
-           * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-           */
-  #define vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-          // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-          const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
-          // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-          const uint16x8_t t1 = vandq_u16(t0, vec(0b0011111101111111));
-          // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-          const uint16x8_t t2 = vorrq_u16 (t1, vec(0b1000000000000000));
-
-          // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-          const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-          // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-          const uint16x8_t s1 = vandq_u16(utf16_packed, vec(0b0000111111000000));
-          // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-          const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-          // [00bb|bbbb|0000|aaaa]
-          const uint16x8_t s2 = vorrq_u16(s0, s1s);
-          // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-          const uint16x8_t s3 = vorrq_u16(s2, vec(0b1100000011100000));
-          const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-          const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
-          const uint16x8_t m0 = vbicq_u16(vec(0b0100000000000000), one_or_two_bytes_bytemask);
-          const uint16x8_t s4 = veorq_u16(s3, m0);
-  #undef vec
-
-          // 4. expand words 16-bit => 32-bit
-          const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-          const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
-
-          // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-          const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-          const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-  #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-          const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 );
-          const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 );
-  #else
-          const uint16x8_t onemask = { 0x0001, 0x0004,
-                                      0x0010, 0x0040,
-                                      0x0100, 0x0400,
-                                      0x1000, 0x4000 };
-          const uint16x8_t twomask = { 0x0002, 0x0008,
-                                      0x0020, 0x0080,
-                                      0x0200, 0x0800,
-                                      0x2000, 0x8000 };
-  #endif
-          const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
-          const uint16_t mask = vaddvq_u16(combined);
-          // The following fast path may or may not be beneficial.
-          /*if(mask == 0) {
-            // We only have three-byte words. Use fast path.
-            const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-            const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-            const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-            vst1q_u8(utf8_output, utf8_0);
-            utf8_output += 12;
-            vst1q_u8(utf8_output, utf8_1);
-            utf8_output += 12;
-            buf += 8;
-            continue;
-          }*/
-          const uint8_t mask0 = uint8_t(mask);
-
-          const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-          const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
-
-          const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-          const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-          const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
-
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += row0[0];
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += row1[0];
-
-          buf += 8;
-      }
-    // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000)==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            } else {
+                // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+                // check for invalid input
+                const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+                const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+                const uint16x8_t forbidden_bytemask = vandq_u16(vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
+                if (vmaxvq_u16(forbidden_bytemask) != 0) {
+                    return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char*>(utf8_output));
+                }
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t dup_even = make_uint16x8_t(0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+                const uint16x8_t dup_even = { 0x0000, 0x0202, 0x0404, 0x0606,
+                    0x0808, 0x0a0a, 0x0c0c, 0x0e0e };
+#endif
+                /* In this branch we handle three cases:
+                  1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                  2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                  3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+                  We expand the input word (16-bit) into two words (32-bit), thus
+                  we have room for four bytes. However, we need five distinct bit
+                  layouts. Note that the last byte in cases #2 and #3 is the same.
+
+                  We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+                  in register t2.
+
+                  We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+                  either byte 1 for case #2 or byte 2 for case #3. Note that they
+                  differ by exactly one bit.
+
+                  Finally from these two words we build proper UTF-8 sequence, taking
+                  into account the case (i.e., the number of bytes to write).
+                */
+                /**
+                 * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+                 * t2 => [0ccc|cccc] [10cc|cccc]
+                 * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+                 */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+                // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+                const uint16x8_t t0 = vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), vreinterpretq_u8_u16(dup_even)));
+                // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+                const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+                // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+                const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+                // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+                const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+                // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+                const uint16x8_t s1 = vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+                // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+                const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+                // [00bb|bbbb|0000|aaaa]
+                const uint16x8_t s2 = vorrq_u16(s0, s1s);
+                // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+                const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+                const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+                const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(utf16_packed, v_07ff);
+                const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+                const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+                // 4. expand words 16-bit => 32-bit
+                const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+                const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+                // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+                const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+                const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint16x8_t onemask = make_uint16x8_t(0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000);
+                const uint16x8_t twomask = make_uint16x8_t(0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000);
+#else
+                const uint16x8_t onemask = { 0x0001, 0x0004,
+                    0x0010, 0x0040,
+                    0x0100, 0x0400,
+                    0x1000, 0x4000 };
+                const uint16x8_t twomask = { 0x0002, 0x0008,
+                    0x0020, 0x0080,
+                    0x0200, 0x0800,
+                    0x2000, 0x8000 };
+#endif
+                const uint16x8_t combined = vorrq_u16(vandq_u16(one_byte_bytemask, onemask), vandq_u16(one_or_two_bytes_bytemask, twomask));
+                const uint16_t mask = vaddvq_u16(combined);
+                // The following fast path may or may not be beneficial.
+                /*if(mask == 0) {
+                  // We only have three-byte words. Use fast path.
+                  const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+                  const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+                  const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+                  vst1q_u8(utf8_output, utf8_0);
+                  utf8_output += 12;
+                  vst1q_u8(utf8_output, utf8_1);
+                  utf8_output += 12;
+                  buf += 8;
+                  continue;
+                }*/
+                const uint8_t mask0 = uint8_t(mask);
+
+                const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+                const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+                const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+                const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+                const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+                const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+                const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+                vst1q_u8(utf8_output, utf8_0);
+                utf8_output += row0[0];
+                vst1q_u8(utf8_output, utf8_1);
+                utf8_output += row1[0];
+
+                buf += 8;
+            }
+            // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes.
         } else {
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output)); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char*>(utf8_output));
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char*>(utf8_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=arm64/arm_convert_utf32_to_utf16.cpp
 /* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out) {
-  uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
-  const char32_t* end = buf + len;
-
-  uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
-
-  while(buf + 4 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(in) <= 0xFFFF) {
-      uint16x4_t utf16_packed = vmovn_u32(in);
-
-      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-      forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
-
-      if (!match_system(big_endian)) {
-        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
-        #else
-        const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
-        #endif
-        utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
-      }
-      vst1_u16(utf16_output, utf16_packed);
-      utf16_output += 4;
-      buf += 4;
-    } else {
-      size_t forward = 3;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
-          *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_out)
+{
+    uint16_t* utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
+    const char32_t* end = buf + len;
+
+    uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
+
+    while (buf + 4 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+
+        // Check if no bits set above 16th
+        if (vmaxvq_u32(in) <= 0xFFFF) {
+            uint16x4_t utf16_packed = vmovn_u32(in);
+
+            const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
+            const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
+            forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)), forbidden_bytemask);
+
+            if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
+#else
+                const uint8x8_t swap = { 1, 0, 3, 2, 5, 4, 7, 6 };
+#endif
+                utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
+            }
+            vst1_u16(utf16_output, utf16_packed);
+            utf16_output += 4;
+            buf += 4;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output)); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (!match_system(big_endian)) {
-            high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
-        }
-      }
-      buf += k;
-    }
-  }
-
-  // check for invalid input
-  if (vmaxv_u16(forbidden_bytemask) != 0) {
-    return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
-  }
-
-  return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
-}
-
-
-template <endianness big_endian>
-std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out) {
-  uint16_t * utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
-
-  while(buf + 4 <= end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-
-    // Check if no bits set above 16th
-    if(vmaxvq_u32(in) <= 0xFFFF) {
-      uint16x4_t utf16_packed = vmovn_u32(in);
-
-      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-      const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
-      if (vmaxv_u16(forbidden_bytemask) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
-      }
-
-      if (!match_system(big_endian)) {
-        #ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
-        #else
-        const uint8x8_t swap = {1, 0, 3, 2, 5, 4, 7, 6};
-        #endif
-        utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
-      }
-      vst1_u16(utf16_output, utf16_packed);
-      utf16_output += 4;
-      buf += 4;
-    } else {
-      size_t forward = 3;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
-          *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+            size_t forward = 3;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (!match_system(big_endian)) {
+                        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+                        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
+        }
+    }
+
+    // Reject the input if any fast-path block held a surrogate (0xD800-0xDFFF); hits were OR-ed into forbidden_bytemask
+    if (vmaxv_u16(forbidden_bytemask) != 0) {
+        return std::make_pair(nullptr, reinterpret_cast<char16_t*>(utf16_output));
+    }
+
+    return std::make_pair(buf, reinterpret_cast<char16_t*>(utf16_output));
+}
+
+template<endianness big_endian>
+std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_out)
+{
+    uint16_t* utf16_output = reinterpret_cast<uint16_t*>(utf16_out);
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
+
+    while (buf + 4 <= end) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(buf));
+
+        // Fast path: all four code points are <= 0xFFFF (no bits set above bit 15), so they narrow to one UTF-16 unit each
+        if (vmaxvq_u32(in) <= 0xFFFF) {
+            uint16x4_t utf16_packed = vmovn_u32(in);
+
+            const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
+            const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
+            const uint16x4_t forbidden_bytemask = vand_u16(vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
+            if (vmaxv_u16(forbidden_bytemask) != 0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), reinterpret_cast<char16_t*>(utf16_output));
+            }
+
+            if (!match_system(big_endian)) {
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+                const uint8x8_t swap = make_uint8x8_t(1, 0, 3, 2, 5, 4, 7, 6);
+#else
+                const uint8x8_t swap = { 1, 0, 3, 2, 5, 4, 7, 6 };
+#endif
+                utf16_packed = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(utf16_packed), swap));
+            }
+            vst1_u16(utf16_output, utf16_packed);
+            utf16_output += 4;
+            buf += 4;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output)); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (!match_system(big_endian)) {
-            high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 3;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    *utf16_output++ = !match_system(big_endian) ? char16_t(word >> 8 | word << 8) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), reinterpret_cast<char16_t*>(utf16_output));
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (!match_system(big_endian)) {
+                        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+                        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
+    return std::make_pair(result(error_code::SUCCESS, buf - start), reinterpret_cast<char16_t*>(utf16_output));
 }
 /* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace arm64 {
@@ -13769,92 +16026,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index();
+    simdutf_really_inline bool has_full_block() const;
+    simdutf_really_inline const uint8_t* full_block() const;
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+     * function returns 0 without writing to dst). In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance();
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf;
+    const size_t len;
+    const size_t lenminusstep;
+    size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    in.store(reinterpret_cast<uint8_t*>(buf));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') {
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+    }
+    buf[64] = '\0';
+    return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
+    , idx { 0 }
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{
+    if (len == idx) {
+        return 0;
+    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx);
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE;
 }
 
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace arm64 {
@@ -13863,21 +16138,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -13885,101 +16161,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -13990,51 +16257,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete;
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -14043,7 +16313,7 @@ using utf8_validation::utf8_checker;
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace arm64 {
@@ -14054,15 +16324,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -14071,97 +16342,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 };
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            res.count += count;
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length)
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 };
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            return result(res.error, count + res.count);
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 } // namespace utf8_validation
@@ -14170,10 +16450,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
@@ -14181,63 +16460,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // The implementation is not specific to haswell and should be moved to the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation mask is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -14245,32 +16525,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -14278,258 +16557,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then after the loop in[margin] is the eighth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the eighth-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -14539,68 +16841,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -14608,251 +16908,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    { // Converts UTF-8 to UTF-32; returns SUCCESS with the number of char32_t written, or the scalar fallback's error result with count offset by pos.
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 to 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    { // True when this->error has any bit set, as reported by any_bits_set_anywhere().
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -14862,36 +17184,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{ // Counts the bytes that are not UTF-8 continuation bytes: 64 bytes per SIMD step, then the scalar routine for the remainder.
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{ // Per 64-byte block: one unit per non-continuation byte plus one extra per byte >= 240; the scalar routine handles the remainder.
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240);
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{ // Forwards to count_code_points with the same arguments.
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -14899,64 +17222,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace arm64 {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{ // Processes 32 char16_t per step, counting units outside 0xDC00..0xDFFF; the scalar routine handles the remainder.
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{ // Per 32-unit block: 1 byte per unit <= 0x7F, 2 per unit <= 0x7FF, 3 per other unit outside 0xD800..0xDFFF, 2 per unit inside it; scalar routine for the remainder.
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{ // Forwards to count_code_points<big_endian> with the same arguments.
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{ // Byte-swaps 32 char16_t per step into output; the scalar routine handles the remaining size - pos units.
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -14964,739 +17295,1106 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 } // namespace arm64
 } // namespace simdutf
 /* end file src/generic/utf16.h */
+
+// placeholder scalars
+
 //
 // Implementation-specific overrides
 //
 namespace simdutf {
 namespace arm64 {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{ // BOM result wins if present; otherwise even lengths go to arm_detect_encodings and odd lengths are only tested as UTF-8.
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) {
+        return arm_detect_encodings<utf8_validation::utf8_checker>(input, length);
     } else {
-      return simdutf::encoding_type::unspecified;
+        if (implementation::validate_utf8(input, length)) {
+            return simdutf::encoding_type::UTF8;
+        } else {
+            return simdutf::encoding_type::unspecified;
+        }
     }
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_utf8(buf,len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{ // Forwards to arm64::utf8_validation::generic_validate_utf8.
+    return arm64::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{ // Forwards to arm64::utf8_validation::generic_validate_utf8_with_errors.
+    return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_ascii(buf,len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{ // Forwards to arm64::utf8_validation::generic_validate_ascii.
+    return arm64::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{ // Forwards to arm64::utf8_validation::generic_validate_ascii_with_errors.
+    return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{ // The SIMD pass yields a tail pointer: null is reported as invalid, otherwise the scalar validator checks from tail to the end.
+    const char16_t* tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{ // The SIMD pass yields a tail pointer: null is reported as invalid, otherwise the scalar validator checks from tail to the end.
+    const char16_t* tail = arm_validate_utf16<endianness::BIG>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{ // When the SIMD pass stops short of len, the scalar validator resumes at res.count and its count is added to that offset.
+    result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{ // When the SIMD pass stops short of len, the scalar validator resumes at res.count and its count is added to that offset.
+    result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = arm_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{ // The SIMD pass yields a tail pointer: null is reported as invalid, otherwise the scalar validator checks from tail to the end.
+    const char32_t* tail = arm_validate_utf32le(buf, len);
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  result res = arm_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{ // When the SIMD pass stops short of len, the scalar validator resumes at res.count and its count is added to that offset.
+    result res = arm_validate_utf32le_with_errors(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{ // Forwards directly to the scalar Latin-1 to UTF-8 converter.
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Forwards directly to the scalar Latin-1 to UTF-16 converter, little-endian variant.
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Forwards directly to the scalar Latin-1 to UTF-16 converter, big-endian variant.
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
+{ // Forwards to the scalar converter. NOTE(review): latin1_output is the char32_t (UTF-32) destination despite its name.
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards directly to the scalar UTF-8 to Latin-1 converter.
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards directly to the scalar converter's convert_with_errors and returns its result unchanged.
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards directly to the scalar converter's convert_valid.
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Runs a freshly constructed validating_transcoder with little-endian output.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Runs a freshly constructed validating_transcoder with big-endian output.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Runs a freshly constructed validating_transcoder's convert_with_errors with little-endian output.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Runs a freshly constructed validating_transcoder's convert_with_errors with big-endian output.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{ // Forwards to utf8_to_utf16::convert_valid with little-endian output.
+    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{ // Forwards to utf8_to_utf16::convert_valid with big-endian output.
+    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // Runs a freshly constructed utf8_to_utf32::validating_transcoder's convert.
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // Runs a freshly constructed utf8_to_utf32::validating_transcoder's convert_with_errors.
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    char32_t* utf32_output) const noexcept
+{ // Forwards to utf8_to_utf32::convert_valid with the same arguments.
+    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar UTF-16LE -> Latin-1 converter and returns its result unchanged.
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar UTF-16BE -> Latin-1 converter and returns its result unchanged.
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar error-reporting UTF-16LE -> Latin-1 converter; its `result` is returned as-is.
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar error-reporting UTF-16BE -> Latin-1 converter; its `result` is returned as-is.
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar convert_valid routine for UTF-16BE -> Latin-1.
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar convert_valid routine for UTF-16LE -> Latin-1.
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{ // UTF-16LE -> UTF-8: run the arm_convert_utf16_to_utf8 kernel, then the scalar converter on any input it left. Returns bytes written, or 0 on failure.
+    std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output; // bytes the kernel has written so far
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{ // UTF-16BE -> UTF-8: run the arm_convert_utf16_to_utf8 kernel, then the scalar converter on any input it left. Returns bytes written, or 0 on failure.
+    std::pair<const char16_t*, char*> ret = arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output; // bytes the kernel has written so far
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{ // UTF-16LE -> UTF-8 with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // On success, count becomes the number of UTF-8 bytes written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{ // UTF-16BE -> UTF-8 with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char*> ret = arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
     }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    ret.first.count = ret.second - utf8_output; // On success, count becomes the number of UTF-8 bytes written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf16le_to_utf8.
+    return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf16be_to_utf8.
+    return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{ // UTF-32 -> UTF-8: run the arm_convert_utf32_to_utf8 kernel, then the scalar converter on any input it left. Returns bytes written, or 0 on failure.
+    std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output; // bytes the kernel has written so far
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{ // UTF-32 -> UTF-8 with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the UTF-16 *_with_errors wrappers; the scalar pass re-reads from ret.first.count - confirm intended
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // On success, count becomes the number of UTF-8 bytes written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // UTF-16LE -> UTF-32: run the arm_convert_utf16_to_utf32 kernel, then the scalar converter on any input it left. Returns char32_t units written, or 0 on failure.
+    std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output; // despite the name this is a char32_t* difference, i.e. a count of char32_t units, not bytes
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // UTF-16BE -> UTF-32: run the arm_convert_utf16_to_utf32 kernel, then the scalar converter on any input it left. Returns char32_t units written, or 0 on failure.
+    std::pair<const char16_t*, char32_t*> ret = arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output; // despite the name this is a char32_t* difference, i.e. a count of char32_t units, not bytes
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // UTF-16LE -> UTF-32 with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // On success, count becomes the number of 32-bit (char32_t) units written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // UTF-16BE -> UTF-32 with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char32_t*> ret = arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // On success, count becomes the number of 32-bit (char32_t) units written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar UTF-32 -> Latin-1 converter and returns its result unchanged.
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar error-reporting UTF-32 -> Latin-1 converter; its `result` is returned as-is.
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{ // Forwards straight to the scalar convert_valid routine for UTF-32 -> Latin-1.
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf32_to_utf8.
+    return convert_utf32_to_utf8(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // UTF-32 -> UTF-16LE: run the arm_convert_utf32_to_utf16 kernel, then the scalar converter on any input it left. Returns char16_t units written, or 0 on failure.
+    std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output; // despite the name this is a char16_t* difference, i.e. a count of char16_t units, not bytes
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // UTF-32 -> UTF-16BE: run the arm_convert_utf32_to_utf16 kernel, then the scalar converter on any input it left. Returns char16_t units written, or 0 on failure.
+    std::pair<const char32_t*, char16_t*> ret = arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output); // first: input position reached; second: output write position
+    if (ret.first == nullptr) { // a null input position from the kernel is reported as failure
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output; // despite the name this is a char16_t* difference, i.e. a count of char16_t units, not bytes
+    if (ret.first != buf + len) { // the kernel stopped before the end of the input
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // a zero return from the scalar pass over the remainder is reported as failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
     }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // UTF-32 -> UTF-16LE with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the UTF-16 *_with_errors wrappers; the scalar pass re-reads from ret.first.count - confirm intended
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // On success, count becomes the number of 16-bit (char16_t) units written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // UTF-32 -> UTF-16BE with error reporting: kernel first, scalar converter for the remaining input.
+    // ret.first.count is always the kernel's position in the input buffer, not the number of output units written, even when it consumed everything
+    std::pair<result, char16_t*> ret = arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike the UTF-16 *_with_errors wrappers; the scalar pass re-reads from ret.first.count - confirm intended
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // scalar pass started at buf + ret.first.count, so shift its count by that offset
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // on success the scalar count is used to advance the output cursor
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // On success, count becomes the number of 16-bit (char16_t) units written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf32_to_utf16le.
+    return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf32_to_utf16be.
+    return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf16le_to_utf32.
+    return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // No separate path for the "valid" variant: it reuses convert_utf16be_to_utf32.
+    return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{ // Forwards to utf16::change_endianness_utf16 with the same input, length and output arguments.
+    utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::count_code_points instantiated for little-endian input.
+    return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::count_code_points instantiated for big-endian input.
+    return utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{ // Forwards to utf8::count_code_points and returns its result.
+    return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{ // Forwards to the scalar latin1_length_from_utf8 helper, which receives the buffer itself.
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{ // Takes only the input length (no buffer) and forwards it to the scalar helper.
+    return scalar::utf16::latin1_length_from_utf16(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{ // Takes only the input length (no buffer) and forwards it to the scalar helper.
+    return scalar::utf32::latin1_length_from_utf32(length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
-  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
-  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
-    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
-    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
-    const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{ // Forwards to the scalar utf8_length_from_latin1 helper, which receives the buffer itself.
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
 
-    const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
-    const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
-    const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::utf8_length_from_utf16 instantiated for little-endian input.
+    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
 
-    const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
-    const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::utf8_length_from_utf16 instantiated for big-endian input.
+    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
 
-    size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
-    size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
-    size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{ // Takes only the input length (no buffer) and forwards it to the scalar helper.
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
 
-    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
-  }
-  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{ // Takes only the input length (no buffer) and forwards it to the scalar helper.
+    return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
-    const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
-    const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
-    const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
-    size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
-    count += 4 + surrogate_count;
-  }
-  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf32_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
+    const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
+    const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+    const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input + pos));
+        const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
+        const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
+        const uint32x4_t two_bytes_bytemask = veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
+        const uint32x4_t three_bytes_bytemask = veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+
+        const uint16x8_t reduced_ascii_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
+        const uint16x8_t reduced_two_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
+        const uint16x8_t reduced_three_bytes_bytemask = vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+
+        const uint16x8_t compressed_bytemask0 = vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
+        const uint16x8_t compressed_bytemask1 = vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+
+        size_t ascii_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
+        size_t two_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
+        size_t three_bytes_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+
+        count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+    }
+    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+    const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) {
+        uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input + pos));
+        const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
+        const uint16x8_t reduced_bytemask = vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
+        const uint16x8_t compressed_bytemask = vpaddq_u16(reduced_bytemask, reduced_bytemask);
+        size_t surrogate_count = count_ones(vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
+        count += 4 + surrogate_count;
+    }
+    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf32_length_from_utf8(input, length);
 }
 
 } // namespace arm64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/arm64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/arm64/end.h
 /* begin file src/simdutf/arm64/end.h */
 /* end file src/simdutf/arm64/end.h */
 /* end file src/arm64/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_FALLBACK
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=fallback/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=fallback/implementation.cpp
 /* begin file src/fallback/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/begin.h
 /* begin file src/simdutf/fallback/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "fallback"
 // #define SIMDUTF_IMPLEMENTATION fallback
 /* end file src/simdutf/fallback/begin.h */
 
-
-
-
-
-
-
-
 namespace simdutf {
 namespace fallback {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  int out = 0;
-  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
-  if((length % 2) == 0) {
-    if(validate_utf16le(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
-  }
-  if((length % 4) == 0) {
-    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
-  }
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    int out = 0;
+    if (validate_utf8(input, length)) {
+        out |= encoding_type::UTF8;
+    }
+    if ((length % 2) == 0) {
+        if (validate_utf16le(reinterpret_cast<const char16_t*>(input), length / 2)) {
+            out |= encoding_type::UTF16_LE;
+        }
+    }
+    if ((length % 4) == 0) {
+        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
+            out |= encoding_type::UTF32_LE;
+        }
+    }
 
-  return out;
+    return out;
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
     return scalar::utf8::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
     return scalar::utf8::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
     return scalar::ascii::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
     return scalar::ascii::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
     return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
     return scalar::utf32::validate(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
     return scalar::utf32::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-   return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-   return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return scalar::utf8_to_utf32::convert_valid(input, size,  utf32_output);
+    char32_t* utf32_output) const noexcept
+{
+    return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  scalar::utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    scalar::utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return length;
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf8_length_from_utf32(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf16_length_from_utf32(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace fallback
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/fallback/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/fallback/end.h
 /* begin file src/simdutf/fallback/end.h */
 /* end file src/simdutf/fallback/end.h */
 /* end file src/fallback/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_ICELAKE
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/implementation.cpp
 /* begin file src/icelake/implementation.cpp */
 
-
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/begin.h
 /* begin file src/simdutf/icelake/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "icelake"
 // #define SIMDUTF_IMPLEMENTATION icelake
@@ -15708,7 +18406,7 @@ SIMDUTF_TARGET_ICELAKE
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) // keep the flag unspaced: "-Wmaybe - uninitialized" is not a GCC warning name
 #endif // end of workaround
 /* end file src/simdutf/icelake/begin.h */
 namespace simdutf {
@@ -15717,10 +18415,11 @@ namespace {
 #ifndef SIMDUTF_ICELAKE_H
 #error "icelake.h must be included"
 #endif
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf8_common.inl.cpp
 /* begin file src/icelake/icelake_utf8_common.inl.cpp */
 // Common procedures for both validating and non-validating conversions from UTF-8.
-enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL};
+enum block_processing_mode { SIMDUTF_FULL, // whole 64-byte block: unmasked load, lane mask is all ones
+    SIMDUTF_TAIL }; // partial block: only the first `gap` bytes are loaded, through a mask
 
 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
 using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
@@ -15736,302 +18435,329 @@ using utf8_to_utf32_result = std::pair<const char*, uint32_t*>;
     The provided in and out pointers are advanced according to how many input
     bytes have been processed, upon success.
 */
-template <block_processing_mode tail, endianness big_endian>
-simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
-  // constants
-  __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
-  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
-  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
-  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
-  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
-  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
-  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
-  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
-  __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  // Note that 'tail' is a compile-time constant !
-  __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
-  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
-  __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
-  if(_ktestc_mask64_u8(m1, b)) {// NOT(m1) AND b -- if all zeroes, then all ASCII
-  // alternatively, we could do 'if (m1 == b) { '
+template<block_processing_mode tail, endianness big_endian>
+simdutf_really_inline bool process_block_utf8_to_utf16(const char*& in, char16_t*& out, size_t gap)
+{
+    // constants
+    __m512i mask_identity = _mm512_set_epi8(63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
+    __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
+    __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
+    __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
+    __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
+    __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
+    __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
+    __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
+    __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    // Note that 'tail' is a compile-time constant !
+    __mmask64 b = (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
+    __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) : _mm512_maskz_loadu_epi8(b, in);
+    __mmask64 m1 = (tail == SIMDUTF_FULL) ? _mm512_cmplt_epu8_mask(input, mask_80808080) : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
+    if (_ktestc_mask64_u8(m1, b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
+        // alternatively, we could do 'if (m1 == b) { '
+        if (tail == SIMDUTF_FULL) {
+            in += 64; // consumed 64 bytes
+            // we convert a full 64-byte block, writing 128 bytes.
+            __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+            if (big_endian) {
+                input1 = _mm512_shuffle_epi8(input1, byteflip);
+            }
+            _mm512_storeu_si512(out, input1);
+            out += 32;
+            __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+            if (big_endian) {
+                input2 = _mm512_shuffle_epi8(input2, byteflip);
+            }
+            _mm512_storeu_si512(out, input2);
+            out += 32;
+            return true; // we are done
+        } else {
+            in += gap;
+            if (gap <= 32) {
+                __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+                if (big_endian) {
+                    input1 = _mm512_shuffle_epi8(input1, byteflip);
+                }
+                _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
+                out += gap;
+            } else {
+                __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+                if (big_endian) {
+                    input1 = _mm512_shuffle_epi8(input1, byteflip);
+                }
+                _mm512_storeu_si512(out, input1);
+                out += 32;
+                __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+                if (big_endian) {
+                    input2 = _mm512_shuffle_epi8(input2, byteflip);
+                }
+                _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
+                out += gap - 32;
+            }
+            return true; // we are done
+        }
+    }
+    // classify characters further
+    __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
+        _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
+    __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
+        _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
+
+    __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
+        _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
+                        // Overlong 2-byte sequence
+    if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
+        // Overlong 2-byte sequence
+        return false;
+    }
+    if (_ktestz_mask64_u8(m34, m34) == 0) {
+        // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
+        __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
+            _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
+
+        __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
+
+        __mmask64 mp1 = _kshiftli_mask64(m234, 1);
+        __mmask64 mp2 = _kshiftli_mask64(m34, 2);
+        // We could do it as follows...
+        // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
+        // but GCC generates better code when we do:
+        if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
+            // Fast path with 1,2,3 bytes
+            __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
+            __mmask64 m1234 = _kor_mask64(m1, m234);
+            // mismatched continuation bytes:
+            if (tail == SIMDUTF_FULL) {
+                __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+                // the presence of a 1 bit indicates that they overlap.
+                // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 if all zeroes.
+                if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+                    return false;
+                }
+            } else {
+                __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+                if (mc != bxorm1234) {
+                    return false;
+                }
+            }
+            // mend: identifying the last bytes of each sequence to be decoded
+            __mmask64 mend = _kshiftri_mask64(m1234, 1);
+            if (tail != SIMDUTF_FULL) {
+                mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
+            }
+
+            __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+            __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+            __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+            __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
+            __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
+                clearedbytes); // the last byte of each character
+
+            __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+            __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+            __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+            __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
+                beforeasciibytes); // the second last bytes (of two, three byte seq,
+                                   // surrogates)
+            secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+
+            __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
+                indexofsecondlastbytes); // indices of the second last bytes
+            __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
+                clearedbytes); // only those that are the third last byte of a sequece
+            __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
+                thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                                // surrogate)
+            thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+            __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
+            // the elements of Wout excluding the last element if it happens to be a high surrogate:
+
+            __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
+
+            // Encodings out of range...
+            {
+                // the location of 3-byte sequence start bytes in the input
+                __mmask64 m3 = m34 & (b ^ m4);
+                // words in Wout corresponding to 3-byte sequences.
+                __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+                __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+                __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+                __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+                __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+                __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+                if (_kor_mask32(Msmall800, M3s)) {
+                    return false;
+                }
+            }
+            int64_t nout = _mm_popcnt_u64(mprocessed);
+            in += 64 - _lzcnt_u64(mprocessed);
+            if (big_endian) {
+                Wout = _mm512_shuffle_epi8(Wout, byteflip);
+            }
+            _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+            out += nout;
+            return true; // ok
+        }
+        //
+        // We have a 4-byte sequence, this is the general case.
+        // Slow!
+        __mmask64 mp3 = _kshiftli_mask64(m4, 3);
+        __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
+        __mmask64 m1234 = _kor_mask64(m1, m234);
+
+        // mend: identifying the last bytes of each sequence to be decoded
+        __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
+        if (tail != SIMDUTF_FULL) {
+            mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
+        }
+        __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+        __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+
+        __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+        __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input); // high two bits cleared where not ASCII
+        __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
+            clearedbytes); // the last byte of each character
+
+        __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+        __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+        __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+        __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
+            beforeasciibytes); // the second last bytes (of two, three byte seq,
+                               // surrogates)
+        secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+
+        __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
+            indexofsecondlastbytes); // indices of the second last bytes
+        __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
+            clearedbytes); // only those that are the third last byte of a sequece
+        __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
+            thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                            // surrogate)
+        thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+        __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
+        uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
+        __mmask32 Mlo = __mmask32(Mlo_uint64);
+        __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
+        __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
+            mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
+        __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
+            4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
+        __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
+            lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
+        __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
+            mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
+        // the elements of Wout excluding the last element if it happens to be a high surrogate:
+        __mmask32 Mout = ~(Mhi & 0x80000000);
+        __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
+
+        // mismatched continuation bytes:
+        if (tail == SIMDUTF_FULL) {
+            __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+            // the presence of a 1 bit indicates that they overlap.
+            // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masks and return 1 if all zeroes.
+            if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+                return false;
+            }
+        } else {
+            __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+            if (mc != bxorm1234) {
+                return false;
+            }
+        }
+        // Encodings out of range...
+        {
+            // the location of 3-byte sequence start bytes in the input
+            __mmask64 m3 = m34 & (b ^ m4);
+            // words in Wout corresponding to 3-byte sequences.
+            __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+            __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+            __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+            __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+            __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+            __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+            __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
+            __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
+            if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
+                return false;
+            }
+        }
+        in += 64 - _lzcnt_u64(mprocessed);
+        int64_t nout = _mm_popcnt_u64(mprocessed);
+        if (big_endian) {
+            Wout = _mm512_shuffle_epi8(Wout, byteflip);
+        }
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+        out += nout;
+        return true; // ok
+    }
+    // Fast path 2: all ASCII or 2 byte
+    __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
+    // on top of -0xc0 we subtract -2 which we get back later of the
+    // continuation byte tags
+    __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
+    __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
     if (tail == SIMDUTF_FULL) {
-      in += 64;          // consumed 64 bytes
-      // we convert a full 64-byte block, writing 128 bytes.
-      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-      if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
-      _mm512_storeu_si512(out, input1);
-      out += 32;
-      __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-      if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
-      _mm512_storeu_si512(out, input2);
-      out += 32;
-      return true; // we are done
+        __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
+        if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
+            return false;
+        }
     } else {
-      in += gap;
-      if (gap <= 32) {
-        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-        if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
-        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), input1);
-        out += gap;
-      } else {
-        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-        if(big_endian) { input1 = _mm512_shuffle_epi8(input1, byteflip); }
-        _mm512_storeu_si512(out, input1);
-        out += 32;
-        __m512i input2 = _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-        if(big_endian) { input2 = _mm512_shuffle_epi8(input2, byteflip); }
-        _mm512_mask_storeu_epi16(out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
-        out += gap - 32;
-      }
-      return true; // we are done
-    }
-  }
-  // classify characters further
-  __mmask64 m234 = _mm512_cmp_epu8_mask(mask_c0c0c0c0, input,
-                                        _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
-  __mmask64 m34 = _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
-                                       _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
-
-  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(m234, input, mask_c2c2c2c2,
-                                                     _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
-                                                                     // Overlong 2-byte sequence
-  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
-    // Overlong 2-byte sequence
-    return false;
-  }
-  if (_ktestz_mask64_u8(m34, m34) == 0) {
-    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a 4-byte sequence!
-    __mmask64 m4 = _mm512_cmp_epu8_mask(input, mask_f0f0f0f0,
-                                        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
-
-    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m1) : _kand_mask64(_knot_mask64(m1), b);
-
-    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
-    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
-    // We could do it as follows...
-    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
-    // but GCC generates better code when we do:
-    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and return 1 if all zeroes
-      // Fast path with 1,2,3 bytes
-      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
-      __mmask64 m1234 = _kor_mask64(m1, m234);
-      // mismatched continuation bytes:
-      if (tail == SIMDUTF_FULL) {
-        __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-        // the presence of a 1 bit indicates that they overlap.
-        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
-        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
-      } else {
-        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-        if (mc != bxorm1234) { return false; }
-      }
-      // mend: identifying the last bytes of each sequence to be decoded
-      __mmask64 mend = _kshiftri_mask64(m1234, 1);
-      if (tail != SIMDUTF_FULL) {
-        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
-      }
-
-
-      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-      __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-      __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-      __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
-      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
-                                                        clearedbytes); // the last byte of each character
-
-      __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
-      __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-      __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
-                                                              beforeasciibytes); // the second last bytes (of two, three byte seq,
-                                                                                 // surrogates)
-      secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
-
-      __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
-                                                       indexofsecondlastbytes); // indices of the second last bytes
-      __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
-                                                    clearedbytes); // only those that are the third last byte of a sequece
-      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
-                                                             thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                                                                             // surrogate)
-      thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
-      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
-      // the elements of Wout excluding the last element if it happens to be a high surrogate:
-
-      __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(0xFFFFFFFF, mend) : _pdep_u64(0xFFFFFFFF, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
-
-
-      // Encodings out of range...
-      {
-        // the location of 3-byte sequence start bytes in the input
-        __mmask64 m3 = m34 & (b ^ m4);
-        // words in Wout corresponding to 3-byte sequences.
-        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-        __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-        __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-        if (_kor_mask32(Msmall800, M3s)) { return false; }
-      }
-      int64_t nout = _mm_popcnt_u64(mprocessed);
-      in +=  64 - _lzcnt_u64(mprocessed);
-      if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
-      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-      out += nout;
-      return true; // ok
+        __mmask64 bxorleading = _kxor_mask64(b, leading);
+        if (_kshiftli_mask64(m234, 1) != bxorleading) {
+            return false;
+        }
     }
     //
-    // We have a 4-byte sequence, this is the general case.
-    // Slow!
-    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
-    __mmask64 mc = _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
-    __mmask64 m1234 = _kor_mask64(m1, m234);
-
-    // mend: identifying the last bytes of each sequence to be decoded
-    __mmask64 mend = _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
-    if (tail != SIMDUTF_FULL) {
-      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
-    }
-    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-    __m512i last_and_thirdu16 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-    __m512i nonasciitags = _mm512_maskz_mov_epi8(mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-    __m512i clearedbytes = _mm512_andnot_si512(nonasciitags, input);             // high two bits cleared where not ASCII
-    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, last_and_thirdu16,
-                                                      clearedbytes); // the last byte of each character
-
-    __mmask64 mask_before_non_ascii = _kshiftri_mask64(mask_not_ascii, 1);               // bytes that precede non-ASCII bytes
-    __m512i indexofsecondlastbytes = _mm512_add_epi16(mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-    __m512i beforeasciibytes = _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofsecondlastbytes,
-                                                            beforeasciibytes); // the second last bytes (of two, three byte seq,
-                                                                               // surrogates)
-    secondlastbytes = _mm512_slli_epi16(secondlastbytes, 6);                   // shifted into position
-
-    __m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
-                                                     indexofsecondlastbytes); // indices of the second last bytes
-    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
-                                                  clearedbytes); // only those that are the third last byte of a sequece
-    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
-                                                           thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                                                                           // surrogate)
-    thirdlastbytes = _mm512_slli_epi16(thirdlastbytes, 12);                // shifted into position
-    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, thirdlastbytes, 254);
-    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
-    __mmask32 Mlo = __mmask32(Mlo_uint64);
-    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
-    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(Mlo,
-                                                  mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
-    __m512i shifted4_thirdsecondandlastbytes = _mm512_srli_epi16(thirdsecondandlastbytes,
-                                                                 4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
-    __m512i tagged_lo_surrogates = _mm512_or_si512(thirdsecondandlastbytes,
-                                                   lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
-    __m512i Wout = _mm512_mask_add_epi16(tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
-                                         mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
-    // the elements of Wout excluding the last element if it happens to be a high surrogate:
-    __mmask32 Mout = ~(Mhi & 0x80000000);
-    __mmask64 mprocessed = (tail == SIMDUTF_FULL) ? _pdep_u64(Mout, mend) : _pdep_u64(Mout, _kand_mask64(mend, b)); // we adjust mend at the end of the output.
-
-
-    // mismatched continuation bytes:
     if (tail == SIMDUTF_FULL) {
-      __mmask64 xnormcm1234 = _kxnor_mask64(mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-      // the presence of a 1 bit indicates that they overlap.
-      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 if all zeroes.
-      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { return false; }
+        // In the two-byte/ASCII scenario, we are easily latency bound, so we want
+        // to increment the input buffer as quickly as possible.
+        // We process 32 bytes unless the byte at index 32 is a continuation byte,
+        // in which case we include it as well for a total of 33 bytes.
+        // Note that if x is an ASCII byte, then the following is false:
+        // int8_t(x) <= int8_t(0xc0) under two's complement.
+        in += 32;
+        if (int8_t(*in) <= int8_t(0xc0))
+            in++;
+        // The alternative is to do
+        // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+        // but it requires loading the input, doing the mask computation, and converting
+        // back the mask to a general register. It just takes too long, leaving the
+        // processor likely to be idle.
     } else {
-      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-      if (mc != bxorm1234) { return false; }
-    }
-    // Encodings out of range...
-    {
-      // the location of 3-byte sequence start bytes in the input
-      __mmask64 m3 = m34 & (b ^ m4);
-      // words in Wout corresponding to 3-byte sequences.
-      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-      __mmask32 Msmall800 = _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-      __mmask32 M3s = _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
-      __mmask32 M4s = _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
-      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { return false; }
-    }
-    in += 64 - _lzcnt_u64(mprocessed);
-    int64_t nout = _mm_popcnt_u64(mprocessed);
-    if(big_endian) { Wout = _mm512_shuffle_epi8(Wout, byteflip); }
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-    out += nout;
-    return true; // ok
-  }
-  // Fast path 2: all ASCII or 2 byte
-  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
-  // on top of -0xc0 we substract -2 which we get back later of the
-  // continuation byte tags
-  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
-  __mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
-  if (tail == SIMDUTF_FULL) {
-    __mmask64 xnor234leading = _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
-    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { return false; }
-  } else {
-    __mmask64 bxorleading = _kxor_mask64(b, leading);
-    if (_kshiftli_mask64(m234, 1) != bxorleading) { return false; }
-  }
-  //
-  if (tail == SIMDUTF_FULL) {
-    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
-    // to increment the input buffer as quickly as possible.
-    // We process 32 bytes unless the byte at index 32 is a continuation byte,
-    // in which case we include it as well for a total of 33 bytes.
-    // Note that if x is an ASCII byte, then the following is false:
-    // int8_t(x) <= int8_t(0xc0) under two's complement.
-    in += 32;
-    if(int8_t(*in) <= int8_t(0xc0)) in++;
-    // The alternative is to do
-    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-    // but it requires loading the input, doing the mask computation, and converting
-    // back the mask to a general register. It just takes too long, leaving the
-    // processor likely to be idle.
-  } else {
-    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-  }
-  __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte);          // will contain zero for ascii, and the data
-  lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead));                 // ... zero extended into words
-  __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
-  follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow));             // ... zero extended into words
-  lead = _mm512_slli_epi16(lead, 6);                                         // shifted into position
-  __m512i final = _mm512_add_epi16(follow, lead);                            // combining lead and follow
-
-  if(big_endian) { final = _mm512_shuffle_epi8(final, byteflip); }
-  if (tail == SIMDUTF_FULL) {
-    // Next part is UTF-16 specific and can be generalized to UTF-32.
-    int nout = _mm_popcnt_u32(uint32_t(leading));
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-  } else {
-    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-  }
-
-  return true; // we are fine.
-}
-
+        in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+    }
+    __m512i lead = _mm512_maskz_compress_epi8(leading, leading2byte); // will contain zero for ascii, and the data
+    lead = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(lead)); // ... zero extended into words
+    __m512i follow = _mm512_maskz_compress_epi8(continuation_or_ascii, input); // the last bytes of each sequence
+    follow = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(follow)); // ... zero extended into words
+    lead = _mm512_slli_epi16(lead, 6); // shifted into position
+    __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
 
+    if (big_endian) {
+        final = _mm512_shuffle_epi8(final, byteflip);
+    }
+    if (tail == SIMDUTF_FULL) {
+        // Next part is UTF-16 specific and can be generalized to UTF-32.
+        int nout = _mm_popcnt_u32(uint32_t(leading));
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+        out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+    } else {
+        int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+        out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+    }
 
+    return true; // we are fine.
+}
 
 /*
     utf32_to_utf16_masked converts `count` lower UTF-32 words
@@ -16054,8 +18780,9 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t
     We pass it to the (always inlined) function to encourage the compiler to
     keep the value in a (constant) register.
 */
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output)
+{
 
     const __mmask16 valid = uint16_t((1 << count) - 1);
     // 1. check if we have any surrogate pairs
@@ -16063,11 +18790,11 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
     const __mmask16 sp_mask = _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
 
     if (sp_mask == 0) {
-        if(big_endian) {
-          _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
+        if (big_endian) {
+            _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
 
         } else {
-          _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
+            _mm256_mask_storeu_epi16((__m256i*)output, valid, _mm512_cvtepi32_epi16(utf32));
         }
         return count;
     }
@@ -16097,12 +18824,14 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
         // Here we want to trim all of the upper 16-bit words from the 2-byte
         // characters represented as 4-byte values. We can compute it from
         // sp_mask or the following... It can be more optimized!
-        const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-        const  __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2*count)) - 1));
-        if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
+        const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+        const __mmask32 nonzero_masked = _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
+        if (big_endian) {
+            t5 = _mm512_shuffle_epi8(t5, byteflip);
+        }
         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
         __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
-        _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
+        _mm512_mask_storeu_epi16(output, __mmask32((uint64_t(1) << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1), compressed); // 64-bit shift, same idiom as nonzero_masked: stays defined if the sum reaches 31 or 32
         //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
     }
 
@@ -16129,18 +18858,19 @@ simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, __m51
     We pass it to the (always inlined) function to encourage the compiler to
     keep the value in a (constant) register.
 */
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf32, unsigned int count, char16_t* output)
+{
     // check if we have any surrogate pairs
     const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
     const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
 
     if (sp_mask == 0) {
         // technically, it should be _mm256_storeu_epi16
-        if(big_endian) {
-          _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),_mm512_castsi512_si256(byteflip)));
+        if (big_endian) {
+            _mm256_storeu_si256((__m256i*)output, _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), _mm512_castsi512_si256(byteflip)));
         } else {
-          _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
+            _mm256_storeu_si256((__m256i*)output, _mm512_cvtepi32_epi16(utf32));
         }
         return count;
     }
@@ -16167,11 +18897,13 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3
         const __m512i t3 = _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
         const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
         __m512i t5 = _mm512_ror_epi32(t4, 16);
-        const  __mmask32 nonzero = _kor_mask32(0xaaaaaaaa,_mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-        if(big_endian) { t5 = _mm512_shuffle_epi8(t5, byteflip); }
+        const __mmask32 nonzero = _kor_mask32(0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+        if (big_endian) {
+            t5 = _mm512_shuffle_epi8(t5, byteflip);
+        }
         // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability (zen4)
         __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
-        _mm512_mask_storeu_epi16(output, (1<<(count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
+        _mm512_mask_storeu_epi16(output, (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1, compressed);
         //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
     }
 
@@ -16181,21 +18913,23 @@ simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, __m512i utf3
 /**
  * Store the last N bytes of previous followed by 512-N bytes from input.
  */
-template <int N>
-__m512i prev(__m512i input, __m512i previous) {
-    static_assert(N<=32, "N must be no larger than 32");
-    const __m512i movemask = _mm512_setr_epi32(28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11);
+template<int N>
+__m512i prev(__m512i input, __m512i previous)
+{
+    static_assert(N <= 32, "N must be no larger than 32"); // NOTE(review): the alignr shift below is 16 - N, which is negative for N > 16 - confirm this bound
+    const __m512i movemask = _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); // per _mm512_permutex2var_epi32: indices 16..31 select from the last operand (previous), 0..15 from the first (input)
     const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
 #if SIMDUTF_GCC8 || SIMDUTF_GCC9
-    constexpr int shift = 16-N; // workaround for GCC8,9
+    constexpr int shift = 16 - N; // workaround for GCC8,9
     return _mm512_alignr_epi8(input, rotated, shift);
 #else
-    return _mm512_alignr_epi8(input, rotated, 16-N);
+    return _mm512_alignr_epi8(input, rotated, 16 - N); // other compilers: the shift expression is passed directly
 #endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
 }
 
-template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
-__m512i shuffle_epi128(__m512i v) {
+template<unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
+__m512i shuffle_epi128(__m512i v)
+{
     static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
     static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
     static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
@@ -16205,16 +18939,18 @@ __m512i shuffle_epi128(__m512i v) {
     return _mm512_shuffle_i32x4(v, v, shuffle);
 }
 
-template <unsigned idx>
-constexpr __m512i broadcast_epi128(__m512i v) {
+template<unsigned idx>
+constexpr __m512i broadcast_epi128(__m512i v) // forwards to shuffle_epi128 with idx repeated in all four template slots
+{
     return shuffle_epi128<idx, idx, idx, idx>(v);
 }
 
 /**
  * Current unused.
  */
-template <int N>
-__m512i rotate_by_N_epi8(const __m512i input) {
+template<int N>
+__m512i rotate_by_N_epi8(const __m512i input)
+{
 
     // lanes order: 1, 2, 3, 0 => 0b00_11_10_01
     const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
@@ -16230,7 +18966,8 @@ __m512i rotate_by_N_epi8(const __m512i input) {
     0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
     corresponding bytes during pshufb.
 */
-simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
+simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8)
+{
     /*
         Input:
         - utf8: bytes stored at separate 32-bit words
@@ -16319,8 +19056,7 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
             0x0707070707070707,
             0x0b0a090900000000,
             0x0707070707070707,
-            0x0b0a090900000000
-        );
+            0x0b0a090900000000);
 
         const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
         values = _mm512_sllv_epi32(values, shift);
@@ -16341,8 +19077,7 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
             0x1919191919191919,
             0x0b10151500000000,
             0x1919191919191919,
-            0x0b10151500000000
-        );
+            0x0b10151500000000);
 
         const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
         values = _mm512_srlv_epi32(values, shift);
@@ -16351,29 +19086,29 @@ simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i
     return values;
 }
 
-
-simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int &count) {
+simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, int& count) // count is an out-parameter: it receives count_ones(leading_bytes)
+{
     const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
     const __m512i expand_ver2 = _mm512_setr_epi64(
-                0x0403020103020100,
-                0x0605040305040302,
-                0x0807060507060504,
-                0x0a09080709080706,
-                0x0c0b0a090b0a0908,
-                0x0e0d0c0b0d0c0b0a,
-                0x000f0e0d0f0e0d0c,
-                0x0201000f01000f0e
-    );
+        0x0403020103020100,
+        0x0605040305040302,
+        0x0807060507060504,
+        0x0a09080709080706,
+        0x0c0b0a090b0a0908,
+        0x0e0d0c0b0d0c0b0a,
+        0x000f0e0d0f0e0d0c,
+        0x0201000f01000f0e); // control vector for the _mm512_shuffle_epi8 applied to merged below
     const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
     const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
     const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
     const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
     const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
     count = static_cast<int>(count_ones(leading_bytes));
-    return  _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input);
+    return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, input); // words flagged in leading_bytes are packed first; remaining words come from the zero vector
 }
 
-simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
+simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input)
+{
     __m512i char_class = _mm512_srli_epi32(input, 4);
     /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
     const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
@@ -16382,7 +19117,7 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
     return expanded_utf8_to_utf32(char_class, input);
 }
 /* end file src/icelake/icelake_utf8_common.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_macros.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_macros.inl.cpp
 /* begin file src/icelake/icelake_macros.inl.cpp */
 
 /*
@@ -16426,99 +19161,97 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
         ]
 */
 
-#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                    \
-        {                                                                                                    \
-            const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                              \
-            const __m512i expand_ver2 = _mm512_setr_epi64(                                                   \
-                0x0403020103020100,                                                                          \
-                0x0605040305040302,                                                                          \
-                0x0807060507060504,                                                                          \
-                0x0a09080709080706,                                                                          \
-                0x0c0b0a090b0a0908,                                                                          \
-                0x0e0d0c0b0d0c0b0a,                                                                          \
-                0x000f0e0d0f0e0d0c,                                                                          \
-                0x0201000f01000f0e                                                                           \
-            );                                                                                               \
-            const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                  \
-                                                                                                             \
-            __mmask16 leading_bytes;                                                                         \
-            const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                             \
-            const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                         \
-            const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                             \
-            leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                       \
-                                                                                                             \
-            __m512i char_class;                                                                              \
-            char_class = _mm512_srli_epi32(input, 4);                                                        \
-            /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                           \
-            const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                             \
-            const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                       \
-            char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);              \
-                                                                                                             \
-            const int valid_count = static_cast<int>(count_ones(leading_bytes));                             \
-            const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                 \
-                                                                                                             \
-            const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);    \
-                                                                                                             \
-            if (UTF32) {                                                                                     \
-                if(MASKED) {                                                                                 \
-                    const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                \
-                    _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                  \
-                } else {                                                                                     \
-                    _mm512_storeu_si512((__m512i*)output, out);                                              \
-                }                                                                                            \
-                output += valid_count;                                                                       \
-            } else {                                                                                         \
-                if(MASKED) {                                                                                 \
-                    output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
-                } else {                                                                                     \
-                    output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t *>(output));        \
-                }                                                                                            \
-            }                                                                                                \
-        }
-
-#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                    \
-{                                                                                                           \
-    if (UTF32) {                                                                                            \
-        if(MASKED) {                                                                                        \
-            const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                  \
-            _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                  \
-        } else {                                                                                            \
-            _mm512_storeu_si512((__m512i*)output, INPUT);                                              \
-        }                                                                                                   \
-        output += VALID_COUNT;                                                                              \
-    } else {                                                                                                \
-        if(MASKED) {                                                                                        \
-            output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));      \
-        } else {                                                                                            \
-            output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t *>(output));             \
-        }                                                                                                   \
-    }                                                                                                       \
-}
-
-
-#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                  \
-        if (UTF32) {                                                                      \
-                const __m128i t0 = _mm512_castsi512_si128(utf8);                          \
-                const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                    \
-                const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                    \
-                const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                    \
-                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi32(t0)); \
-                _mm512_storeu_si512((__m512i*)(output + 1*16), _mm512_cvtepu8_epi32(t1)); \
-                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi32(t2)); \
-                _mm512_storeu_si512((__m512i*)(output + 3*16), _mm512_cvtepu8_epi32(t3)); \
-        } else {                                                                          \
-                const __m256i h0 = _mm512_castsi512_si256(utf8);                          \
-                const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                    \
-                if(big_endian) {                                                          \
-                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
-                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
-                } else {                                                                  \
-                _mm512_storeu_si512((__m512i*)(output + 0*16), _mm512_cvtepu8_epi16(h0)); \
-                _mm512_storeu_si512((__m512i*)(output + 2*16), _mm512_cvtepu8_epi16(h1)); \
-                }                                                                         \
-        }
+#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                                                                     \
+    { /* Merge LANE0/LANE1, expand via expanded_utf8_to_utf32, store through output and advance it. */                        \
+        const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);                                                   \
+        const __m512i expand_ver2 = _mm512_setr_epi64(                                                                        \
+            0x0403020103020100,                                                                                               \
+            0x0605040305040302,                                                                                               \
+            0x0807060507060504,                                                                                               \
+            0x0a09080709080706,                                                                                               \
+            0x0c0b0a090b0a0908,                                                                                               \
+            0x0e0d0c0b0d0c0b0a,                                                                                               \
+            0x000f0e0d0f0e0d0c,                                                                                               \
+            0x0201000f01000f0e);                                                                                              \
+        const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);                                                       \
+        /* leading_bytes: bit i is set when (word i & 0xc0) != 0x80, per 32-bit word of input. */                             \
+        __mmask16 leading_bytes;                                                                                              \
+        const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                                                                  \
+        const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                                                              \
+        const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                                                                  \
+        leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                                                            \
+                                                                                                                              \
+        __m512i char_class;                                                                                                   \
+        char_class = _mm512_srli_epi32(input, 4);                                                                             \
+        /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                                                                \
+        const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                                                                  \
+        const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                                                            \
+        char_class = _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);                                   \
+        /* valid_count: used for the store mask and output advance when UTF32, else passed as count. */                       \
+        const int valid_count = static_cast<int>(count_ones(leading_bytes));                                                  \
+        const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);                                                      \
+        /* Pack the utf32 words selected by leading_bytes to the front of out; the rest is zero. */                           \
+        const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, utf32);                         \
+        /* UTF32, big_endian, byteflip and output are not parameters; the expansion site must provide them. */                \
+        if (UTF32) {                                                                                                          \
+            if (MASKED) {                                                                                                     \
+                const __mmask16 valid = uint16_t((1 << valid_count) - 1);                                                     \
+                _mm512_mask_storeu_epi32((__m512i*)output, valid, out);                                                       \
+            } else {                                                                                                          \
+                _mm512_storeu_si512((__m512i*)output, out);                                                                   \
+            }                                                                                                                 \
+            output += valid_count;                                                                                            \
+        } else {                                                                                                              \
+            if (MASKED) {                                                                                                     \
+                output += utf32_to_utf16_masked<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t*>(output)); \
+            } else {                                                                                                          \
+                output += utf32_to_utf16<big_endian>(byteflip, out, valid_count, reinterpret_cast<char16_t*>(output));        \
+            }                                                                                                                 \
+        }                                                                                                                     \
+    }
+
+#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)                                                        \
+    {                                                                                                                           \
+        if (UTF32) {                                                                                                            \
+            if (MASKED) {                                                                                                       \
+                const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);                                                  \
+                _mm512_mask_storeu_epi32((__m512i*)output, valid_mask, INPUT);                                                  \
+            } else {                                                                                                            \
+                _mm512_storeu_si512((__m512i*)output, INPUT);                                                                   \
+            }                                                                                                                   \
+            output += VALID_COUNT;                                                                                              \
+        } else {                                                                                                                \
+            if (MASKED) {                                                                                                       \
+                output += utf32_to_utf16_masked<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t*>(output)); \
+            } else {                                                                                                            \
+                output += utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT, reinterpret_cast<char16_t*>(output));        \
+            }                                                                                                                   \
+        }                                                                                                                       \
+    }
+
+#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                                                               \
+    if (UTF32) {                                                                                                       \
+        const __m128i t0 = _mm512_castsi512_si128(utf8);                                                               \
+        const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                                                         \
+        const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                                                         \
+        const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                                                         \
+        _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_cvtepu8_epi32(t0));                                    \
+        _mm512_storeu_si512((__m512i*)(output + 1 * 16), _mm512_cvtepu8_epi32(t1));                                    \
+        _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_cvtepu8_epi32(t2));                                    \
+        _mm512_storeu_si512((__m512i*)(output + 3 * 16), _mm512_cvtepu8_epi32(t3));                                    \
+    } else {                                                                                                           \
+        const __m256i h0 = _mm512_castsi512_si256(utf8);                                                               \
+        const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                                                         \
+        if (big_endian) {                                                                                              \
+            _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \
+            _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \
+        } else {                                                                                                       \
+            _mm512_storeu_si512((__m512i*)(output + 0 * 16), _mm512_cvtepu8_epi16(h0));                                \
+            _mm512_storeu_si512((__m512i*)(output + 2 * 16), _mm512_cvtepu8_epi16(h1));                                \
+        }                                                                                                              \
+    }
 /* end file src/icelake/icelake_macros.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_from_valid_utf8.inl.cpp
 /* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
 // file included directly
 
@@ -16539,23 +19272,23 @@ simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
     - pair.first    - the first unprocessed input byte
     - pair.second   - the first unprocessed output word
 */
-template <endianness big_endian, typename OUTPUT>
-std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
+template<endianness big_endian, typename OUTPUT>
+std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords)
+{
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
     static_assert(!(UTF32 and big_endian), "we do not currently support big-endian UTF-32");
 
     __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
     const char* ptr = str;
     const char* end = ptr + len;
 
@@ -16570,7 +19303,7 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-        if(ascii == 0) {
+        if (ascii == 0) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16584,8 +19317,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if(valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+        if (valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -16603,8 +19336,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if(valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
+        if (valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -16614,14 +19347,14 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4*16;
+        ptr += 4 * 16;
     }
 
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-        if(ascii == 0) {
+        if (ascii == 0) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16633,8 +19366,8 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if(valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+            if (valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -16648,22 +19381,21 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3*16;
+            ptr += 3 * 16;
         }
     }
-    return {ptr, output};
+    return { ptr, output };
 }
 
-
 using utf8_to_utf16_result = std::pair<const char*, char16_t*>;
 /* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf8_validation.inl.cpp
 /* begin file src/icelake/icelake_utf8_validation.inl.cpp */
 // file included directly
 
-
-simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1) {
-  __m512i mask1 = _mm512_setr_epi64(
+simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i prev1)
+{
+    __m512i mask1 = _mm512_setr_epi64(
         0x0202020202020202,
         0x4915012180808080,
         0x0202020202020202,
@@ -16685,7 +19417,7 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0xcbcbdbcbcbcbcbcb,
         0xcbcbcb8b8383a3e7,
         0xcbcbdbcbcbcbcbcb);
-     __m512i index2 = _mm512_and_si512(prev1, v_0f);
+    __m512i index2 = _mm512_and_si512(prev1, v_0f);
 
     __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
     __m512i mask3 = _mm512_setr_epi64(
@@ -16696,19 +19428,19 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0x101010101010101,
         0x1010101babaaee6,
         0x101010101010101,
-        0x1010101babaaee6
-    );
+        0x1010101babaaee6);
     __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
     __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
     return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
-  }
+}
 
-  simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
-      const __m512i prev_input, const __m512i sc) {
+simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
+    const __m512i prev_input, const __m512i sc)
+{
     __m512i prev2 = prev<2>(input, prev_input);
     __m512i prev3 = prev<3>(input, prev_input);
-    __m512i is_third_byte  = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u-1)); // Only 111_____ will be > 0
-    __m512i is_fourth_byte  = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u-1)); // Only 1111____ will be > 0
+    __m512i is_third_byte = _mm512_subs_epu8(prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
+    __m512i is_fourth_byte = _mm512_subs_epu8(prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
     __m512i is_third_or_fourth_byte = _mm512_or_si512(is_third_byte, is_fourth_byte);
     const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
     is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
@@ -16716,13 +19448,14 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
     const __m512i v_80 = _mm512_set1_epi8(char(0x80));
     return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, 0b1101010);
     //__m512i is_third_or_fourth_byte_mask = _mm512_and_si512(is_third_or_fourth_byte, v_80);
-    //return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
-  }
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline __m512i is_incomplete(const __m512i input) {
+    // return _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
+}
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline __m512i is_incomplete(const __m512i input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     __m512i max_value = _mm512_setr_epi64(
@@ -16735,59 +19468,63 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
         0xffffffffffffffff,
         0xbfdfefffffffffff);
     return _mm512_subs_epu8(input, max_value);
-  }
+}
 
-  struct avx512_utf8_checker {
+struct avx512_utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
-    __m512i error{};
+    __m512i error {};
 
     // The last input we received
-    __m512i prev_input_block{};
+    __m512i prev_input_block {};
     // Whether the last input we received was incomplete (used for ASCII fast path)
-    __m512i prev_incomplete{};
+    __m512i prev_incomplete {};
 
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      __m512i prev1 = prev<1>(input, prev_input);
-      __m512i sc = check_special_cases(input, prev1);
-      this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
+    simdutf_really_inline void check_utf8_bytes(const __m512i input, const __m512i prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        __m512i prev1 = prev<1>(input, prev_input);
+        __m512i sc = check_special_cases(input, prev1);
+        this->error = _mm512_or_si512(check_multibyte_lengths(input, prev_input, sc), this->error);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error = _mm512_or_si512(this->error, this->prev_incomplete);
     }
 
     // returns true if ASCII.
-    simdutf_really_inline bool check_next_input(const __m512i input) {
-      const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-      const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
-      if(ascii == 0) {
-        this->error = _mm512_or_si512(this->error, this->prev_incomplete);
-        return true;
-      } else {
-        this->check_utf8_bytes(input, this->prev_input_block);
-        this->prev_incomplete = is_incomplete(input);
-        this->prev_input_block = input;
-        return false;
-      }
+    simdutf_really_inline bool check_next_input(const __m512i input)
+    {
+        const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+        const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
+        if (ascii == 0) {
+            this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+            return true;
+        } else {
+            this->check_utf8_bytes(input, this->prev_input_block);
+            this->prev_incomplete = is_incomplete(input);
+            this->prev_input_block = input;
+            return false;
+        }
     }
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
+    simdutf_really_inline bool errors() const
+    {
         return _mm512_test_epi8_mask(this->error, this->error) != 0;
     }
 
-  }; // struct avx512_utf8_checker
+}; // struct avx512_utf8_checker
 /* end file src/icelake/icelake_utf8_validation.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_from_utf8.inl.cpp
 /* begin file src/icelake/icelake_from_utf8.inl.cpp */
 // file included directly
 
@@ -16800,48 +19537,56 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
  * completed. Upon error, the output is set to null.
  */
 
-template <endianness big_endian>
-utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
-  const char *const final_in = in + len;
-  bool result = true;
-  while (result) {
-    if (in + 64 <= final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
-    } else if(in < final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
-    } else { break; }
-  }
-  if(!result) { out = nullptr; }
-  return std::make_pair(in, out);
-}
-
-template <endianness big_endian>
-simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, size_t len, char16_t *out) {
-  const char *const init_in = in;
-  const char16_t *const init_out = out;
-  const char *const final_in = in + len;
-  bool  result = true;
-  while (result) {
-    if (in + 64 <= final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
-    } else if(in < final_in) {
-        result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
-    } else { break; }
-  }
-  if(!result) {
-    // rewind_and_convert_with_errors will seek a potential error from in onward,
-    // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
-    simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
-    res.count += (in - init_in);
-    return res;
-  } else {
-    return simdutf::result(error_code::SUCCESS,out - init_out);
-  }
+template<endianness big_endian>
+utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char* in, size_t len, char16_t* out)
+{
+    const char* const final_in = in + len;
+    bool result = true;
+    while (result) {
+        if (in + 64 <= final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
+        } else if (in < final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
+        } else {
+            break;
+        }
+    }
+    if (!result) {
+        out = nullptr;
+    }
+    return std::make_pair(in, out);
 }
 
+template<endianness big_endian>
+simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char* in, size_t len, char16_t* out)
+{
+    const char* const init_in = in;
+    const char16_t* const init_out = out;
+    const char* const final_in = in + len;
+    bool result = true;
+    while (result) {
+        if (in + 64 <= final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
+        } else if (in < final_in) {
+            result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
+        } else {
+            break;
+        }
+    }
+    if (!result) {
+        // rewind_and_convert_with_errors will seek a potential error from in onward,
+        // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
+        simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
+        res.count += (in - init_in);
+        return res;
+    } else {
+        return simdutf::result(error_code::SUCCESS, out - init_out);
+    }
+}
 
-template <endianness big_endian, typename OUTPUT>
-std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
+template<endianness big_endian, typename OUTPUT>
+std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords)
+{
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
@@ -16850,17 +19595,16 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     const char* ptr = str;
     const char* end = ptr + len;
     __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
     OUTPUT* output = dwords;
-    avx512_utf8_checker checker{};
+    avx512_utf8_checker checker {};
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
@@ -16869,7 +19613,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
      */
     while (ptr + 64 + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16882,8 +19626,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if(valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+        if (valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -16901,8 +19645,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if(valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
+        if (valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -16912,7 +19656,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4*16;
+        ptr += 4 * 16;
     }
     const char* validatedptr = ptr; // validated up to ptr
 
@@ -16920,7 +19664,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     // 3*16 bytes, so we may end up double-validating 16 bytes.
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
@@ -16932,8 +19676,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if(valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+            if (valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -16947,24 +19691,25 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3*16;
+            ptr += 3 * 16;
         }
-        validatedptr += 4*16;
+        validatedptr += 4 * 16;
     }
     {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
-       checker.check_next_input(utf8);
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - validatedptr)) - 1, (const __m512i*)validatedptr);
+        checker.check_next_input(utf8);
     }
     checker.check_eof();
-    if(checker.errors()) {
-        return {ptr, nullptr}; // We found an error.
+    if (checker.errors()) {
+        return { ptr, nullptr }; // We found an error.
     }
-    return {ptr, output};
+    return { ptr, output };
 }
 
 // Like validating_utf8_to_fixed_length but returns as soon as an error is identified
-template <endianness big_endian, typename OUTPUT>
-std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
+template<endianness big_endian, typename OUTPUT>
+std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords)
+{
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
     static_assert(UTF32 or UTF16, "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
@@ -16973,17 +19718,16 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     const char* ptr = str;
     const char* end = ptr + len;
     __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
     OUTPUT* output = dwords;
-    avx512_utf8_checker checker{};
+    avx512_utf8_checker checker {};
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
@@ -16992,14 +19736,14 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
      */
     while (ptr + 64 + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
             continue;
         }
-        if(checker.errors()) {
-            return {ptr, output, false}; // We found an error.
+        if (checker.errors()) {
+            return { ptr, output, false }; // We found an error.
         }
         const __m512i lane0 = broadcast_epi128<0>(utf8);
         const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -17008,8 +19752,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         const __m512i lane2 = broadcast_epi128<2>(utf8);
         int valid_count1;
         __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-        if(valid_count0 + valid_count1 <= 16) {
-            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+        if (valid_count0 + valid_count1 <= 16) {
+            vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
@@ -17027,8 +19771,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         const __m512i lane4 = _mm512_set1_epi32(tmp1);
         int valid_count3;
         __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-        if(valid_count2 + valid_count3 <= 16) {
-            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
+        if (valid_count2 + valid_count3 <= 16) {
+            vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
@@ -17038,7 +19782,7 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
             SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
         }
-        ptr += 4*16;
+        ptr += 4 * 16;
     }
     const char* validatedptr = ptr; // validated up to ptr
 
@@ -17046,12 +19790,12 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     // 3*16 bytes, so we may end up double-validating 16 bytes.
     if (ptr + 64 <= end) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        if (checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
-        } else if(checker.errors()) {
-            return {ptr, output, false}; // We found an error.
+        } else if (checker.errors()) {
+            return { ptr, output, false }; // We found an error.
         } else {
             const __m512i lane0 = broadcast_epi128<0>(utf8);
             const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -17060,8 +19804,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             const __m512i lane2 = broadcast_epi128<2>(utf8);
             int valid_count1;
             __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-            if(valid_count0 + valid_count1 <= 16) {
-                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
+            if (valid_count0 + valid_count1 <= 16) {
+                vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
                 valid_count0 += valid_count1;
                 vec0 = expand_utf8_to_utf32(vec0);
                 SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
@@ -17075,22 +19819,22 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             const __m512i lane3 = broadcast_epi128<3>(utf8);
             SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-            ptr += 3*16;
+            ptr += 3 * 16;
         }
-        validatedptr += 4*16;
+        validatedptr += 4 * 16;
     }
     {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
-       checker.check_next_input(utf8);
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - validatedptr)) - 1, (const __m512i*)validatedptr);
+        checker.check_next_input(utf8);
     }
     checker.check_eof();
-    if(checker.errors()) {
-        return {ptr, output, false}; // We found an error.
+    if (checker.errors()) {
+        return { ptr, output, false }; // We found an error.
     }
-    return {ptr, output, true};
+    return { ptr, output, true };
 }
 /* end file src/icelake/icelake_from_utf8.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf32.inl.cpp
 /* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
 // file included directly
 
@@ -17098,755 +19842,786 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
   Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* end = buf + len;
-  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
-  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
-  __mmask32 carry{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (buf + 32 <= end) {
-    // Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
-    __m512i in = _mm512_loadu_si512((__m512i*)buf);
-    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-
-    // H - bitmask for high surrogates
-    const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
-    // H - bitmask for low surrogates
-    const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
-
-    if ((H|L)) {
-      // surrogate pair(s) in a register
-      const __mmask32 V = (L ^ (carry | (H << 1)));   // A high surrogate must be followed by low one and a low one must be preceded by a high one.
-                                                      // If valid, V should be equal to 0
-
-      if(V == 0) {
-        // valid case
-        /*
-            Input surrogate pair:
-            |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
-                low surrogate      high surrogate
-        */
-        /*  1. Expand all words to 32-bit words
-            in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-        */
-        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-        const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1));
-
-        /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
-            in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-            shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-        */
-        const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
-        const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
-
-        /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
-            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-        */
-        const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
-        const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H>>16), second, 10);
-
-        /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
-            in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-            shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-            constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
-        */
-        const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
-        const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
-        const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
-
-        const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H>>16), aligned_second, shifted_second);
-        const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H>>16), added_second, constant);
-
-        //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
-        const __mmask32 valid = ~L & 0x7fffffff;
-        // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
-        // to ease performance portability to Zen 4.
-        const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
-        const size_t howmany1 = count_ones((uint16_t)(valid));
-        _mm512_storeu_si512((__m512i *) utf32_output,  compressed_first);
-        utf32_output += howmany1;
-        const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
-        const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
-        // The following could be unsafe in some cases?
-        //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
-        _mm512_mask_storeu_epi32((__m512i *) utf32_output, __mmask16((1<<howmany2)-1), compressed_second);
-        utf32_output += howmany2;
-        // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
-        buf += 31;
-        carry = (H >> 30) & 0x1;
-      } else {
-        // invalid case
-        return std::make_tuple(buf+carry, utf32_output, false);
-      }
-    } else {
-      // no surrogates
-      // extend all thirty-two 16-bit words to thirty-two 32-bit words
-      _mm512_storeu_si512((__m512i *)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
-      _mm512_storeu_si512((__m512i *)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in,1)));
-      utf32_output += 32;
-      buf += 32;
-      carry = 0;
-    }
-  } // while
-  return std::make_tuple(buf+carry, utf32_output, true);
+template<endianness big_endian>
+std::tuple<const char16_t*, char32_t*, bool> convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* end = buf + len;
+    const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+    const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
+    __mmask32 carry { 0 };
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (buf + 32 <= end) {
+        // Always safe because buf + 32 <= end, so at least 32 char16_t code units (64 bytes) are readable:
+        __m512i in = _mm512_loadu_si512((__m512i*)buf);
+        if (big_endian) {
+            in = _mm512_shuffle_epi8(in, byteflip);
+        }
+
+        // H - bitmask for high surrogates
+        const __mmask32 H = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
+        // L - bitmask for low surrogates
+        const __mmask32 L = _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
+
+        if ((H | L)) {
+            // surrogate pair(s) in a register
+            const __mmask32 V = (L ^ (carry | (H << 1))); // A high surrogate must be followed by low one and a low one must be preceded by a high one.
+                                                          // If valid, V should be equal to 0
+
+            if (V == 0) {
+                // valid case
+                /*
+                    Input surrogate pair:
+                    |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
+                        low surrogate      high surrogate
+                */
+                /*  1. Expand all words to 32-bit words
+                    in  |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+                */
+                const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+                const __m512i second = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+                /*  2. Shift by one 16-bit word to align low surrogates with high surrogates
+                    in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+                    shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+                */
+                const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
+                const __m512i shifted_second = _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
+
+                /*  3. Align all high surrogates in first and second by shifting to the left by 10 bits
+                    |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+                */
+                const __m512i aligned_first = _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
+                const __m512i aligned_second = _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);
+
+                /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in, shifted and constant
+                    in      |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+                    shifted |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+                    constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
+                */
+                const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
+                const __m512i added_first = _mm512_mask_add_epi32(aligned_first, (__mmask16)H, aligned_first, shifted_first);
+                const __m512i utf32_first = _mm512_mask_add_epi32(added_first, (__mmask16)H, added_first, constant);
+
+                const __m512i added_second = _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16), aligned_second, shifted_second);
+                const __m512i utf32_second = _mm512_mask_add_epi32(added_second, (__mmask16)(H >> 16), added_second, constant);
+
+                //  5. Store all valid UTF-32 words (low surrogate positions and 32nd word are invalid)
+                const __mmask32 valid = ~L & 0x7fffffff;
+                // We deliberately do a _mm512_maskz_compress_epi32 followed by storeu_epi32
+                // to ease performance portability to Zen 4.
+                const __m512i compressed_first = _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
+                const size_t howmany1 = count_ones((uint16_t)(valid));
+                _mm512_storeu_si512((__m512i*)utf32_output, compressed_first);
+                utf32_output += howmany1;
+                const __m512i compressed_second = _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
+                const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
+                // The following could be unsafe in some cases?
+                //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
+                _mm512_mask_storeu_epi32((__m512i*)utf32_output, __mmask16((1 << howmany2) - 1), compressed_second);
+                utf32_output += howmany2;
+                // Only process 31 words, but keep track if the 31st word is a high surrogate as a carry
+                buf += 31;
+                carry = (H >> 30) & 0x1;
+            } else {
+                // invalid case
+                return std::make_tuple(buf + carry, utf32_output, false);
+            }
+        } else {
+            // no surrogates
+            // extend all thirty-two 16-bit words to thirty-two 32-bit words
+            _mm512_storeu_si512((__m512i*)(utf32_output), _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
+            _mm512_storeu_si512((__m512i*)(utf32_output) + 1, _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
+            utf32_output += 32;
+            buf += 32;
+            carry = 0;
+        }
+    } // while
+    return std::make_tuple(buf + carry, utf32_output, true);
 }
 /* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf8.inl.cpp
 /* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
 // file included directly
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  __m256i running_max = _mm256_setzero_si256();
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-  if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
-
-  return std::make_pair(buf, utf8_output);
+std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    __m256i running_max = _mm256_setzero_si256();
+    __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    // check for invalid input
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    return std::make_pair(buf, utf8_output);
 }
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const char32_t* start = buf;
-
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    // Check for too large input
-    const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-    if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-      // Check for illegal surrogate words
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-      }
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const char32_t* start = buf;
+
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        // Check for too large input
+        const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+        if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+        }
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+            // Check for illegal surrogate words
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+            }
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf32_to_utf16.inl.cpp
 /* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
 // file included directly
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* end = buf + len;
+template<endianness big_endian> // big_endian: when set, every 16-bit output unit is byte-swapped before it is written
+std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) // returns {input position reached, end of written output}; first member is nullptr on invalid input
+{
+    const char32_t* end = buf + len; // one past the last input word
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    __m256i forbidden_bytemask = _mm256_setzero_si256(); // collects surrogate hits from the vector path; inspected once after the loop
 
+    while (buf + 8 + safety_margin <= end) { // runs only while at least 8 + safety_margin input words remain
+        __m256i in = _mm256_loadu_si256((__m256i*)buf); // unaligned load of 8 UTF-32 words
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) { // vector path: all 8 words have their upper 16 bits clear
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); // (word & 0xf800) == 0xd800 <=> word lies in 0xD800..0xDFFF; the error is only recorded here, conversion continues
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); // narrow low and high 128-bit halves into 8 x 16-bit units, keeping input order
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // exchanges the two bytes of each 16-bit unit
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); // writes 8 UTF-16 units (16 bytes)
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // scalar path: converts up to 7 words, then returns to the vector loop
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // NOTE(review): the loop guard implies end - buf >= 8 + safety_margin, so this clamp looks unreachable here - confirm
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf16_output); // surrogate code point in the input: fail immediately
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf16_output); // value above 0x10FFFF: fail immediately
+                    }
+                    word -= 0x10000; // remaining value is split into two 10-bit halves below
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // upper 10 bits
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // lower 10 bits
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k; // skip the words consumed by the scalar loop
         }
-      }
-      buf += k;
     }
-  }
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
+    // check for invalid input (surrogates recorded by the vector path)
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf16_output);
+    }
 
-  return std::make_pair(buf, utf16_output);
+    return std::make_pair(buf, utf16_output); // buf may be short of end: the tail that failed the loop guard is not converted here
 }
 
 // Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template <endianness big_endian>
-std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
+template<endianness big_endian> // big_endian: when set, every 16-bit output unit is byte-swapped before it is written
+std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) // returns {result(code, input offset), end of written output}
+{
+    const char32_t* start = buf; // kept so offsets can be reported relative to the beginning of the input
+    const char32_t* end = buf + len; // one past the last input word
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+    while (buf + 8 + safety_margin <= end) { // runs only while at least 8 + safety_margin input words remain
+        __m256i in = _mm256_loadu_si256((__m256i*)buf); // unaligned load of 8 UTF-32 words
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
-
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-      }
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+        if (saturation_bitmask == 0xffffffff) { // vector path: all 8 words have their upper 16 bits clear
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); // (word & 0xf800) == 0xd800 <=> word lies in 0xD800..0xDFFF
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); // offset is the start of this 8-word block, not the offending word itself
+            }
+
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); // narrow low and high 128-bit halves into 8 x 16-bit units, keeping input order
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); // exchanges the two bytes of each 16-bit unit
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed); // writes 8 UTF-16 units (16 bytes)
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // scalar path: converts up to 7 words, then returns to the vector loop
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // NOTE(review): the loop guard implies end - buf >= 8 + safety_margin, so this clamp looks unreachable here - confirm
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); // offset of the exact offending word
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); // offset of the exact offending word
+                    }
+                    word -= 0x10000; // remaining value is split into two 10-bit halves below
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // upper 10 bits
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // lower 10 bits
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k; // skip the words consumed by the scalar loop
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); // buf - start = words converted so far; the tail that failed the loop guard is not converted here
 }
 /* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_ascii_validation.inl.cpp
 /* begin file src/icelake/icelake_ascii_validation.inl.cpp */
 // file included directly
 
-bool validate_ascii(const char* buf, size_t len) {
-  const char* end = buf + len;
-  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  __m512i running_or = _mm512_setzero_si512();
-  for (; buf + 64 <= end; buf += 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
-    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
-  }
-  if(buf < end) {
-     const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end-buf)) - 1,(const __m512i*)buf);
-    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
-  }
-  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
+bool validate_ascii(const char* buf, size_t len) // true iff none of the len bytes at buf has its 0x80 bit set
+{
+    const char* end = buf + len; // one past the last input byte
+    const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); // 0x80 in every byte lane: selects each byte's high bit
+    __m512i running_or = _mm512_setzero_si512(); // accumulates the high bits of every byte examined so far
+    for (; buf + 64 <= end; buf += 64) { // whole 64-byte blocks
+        const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
+        running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
+    }
+    if (buf < end) { // 1..63 bytes left over after the block loop
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((uint64_t(1) << (end - buf)) - 1, (const __m512i*)buf); // mask has the low (end - buf) bits set; lanes outside the mask are zeroed
+        running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
+    }
+    return (_mm512_test_epi8_mask(running_or, running_or) == 0); // zero accumulator <=> no high bit was ever seen
 }
 /* end file src/icelake/icelake_ascii_validation.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_utf32_validation.inl.cpp
 /* begin file src/icelake/icelake_utf32_validation.inl.cpp */
 // file included directly
 
-const char32_t* validate_utf32(const char32_t* buf, size_t len) {
+const char32_t* validate_utf32(const char32_t* buf, size_t len)
+{
     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
 
     const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
@@ -17854,27 +20629,27 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len) {
     __m512i currentoffsetmax = _mm512_setzero_si512();
 
     while (buf <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-      buf += 16;
-      currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
-      currentmax = _mm512_max_epu32(utf32, currentmax);
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
+        buf += 16;
+        currentoffsetmax = _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
+        currentmax = _mm512_max_epu32(utf32, currentmax);
     }
 
     const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
     const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
     __m512i is_zero = _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-      return nullptr;
+        return nullptr;
     }
     is_zero = _mm512_xor_si512(_mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
     if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-      return nullptr;
+        return nullptr;
     }
 
     return buf;
 }
 /* end file src/icelake/icelake_utf32_validation.inl.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=icelake/icelake_convert_utf16_to_utf8.inl.cpp
 /* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
 // file included directly
 
@@ -17884,194 +20659,188 @@ const char32_t* validate_utf32(const char32_t* buf, size_t len) {
  * is written to 'outlen' and the function reports the number of input word
  * consumed.
  */
-template <endianness big_endian>
-size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
-                               unsigned char *outbuf, size_t *outlen) {
-  __m512i in;
-  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
-  __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  const char16_t * const inbuf_orig = inbuf;
-  const unsigned char * const outbuf_orig = outbuf;
-  size_t adjust = 0;
-  int carry = 0;
-
-  while (inlen >= 32) {
-    in = _mm512_loadu_si512(inbuf);
-    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-    inlen -= 31;
-  lastiteration:
-    inbuf += 31;
-
-  failiteration:
-    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
-      inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
-
-    if (_ktestz_mask32_u8(inmask, is234byte)) {
-      // fast path for ASCII only
-      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
-      outbuf += 31;
-      carry = 0;
-
-      if (inlen < 32) {
-        goto tail;
-      } else {
-        continue;
-      }
-    }
-
-    const __mmask32 is12byte =
-        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
-
-    if (_ktestc_mask32_u8(is12byte, inmask)) {
-      // fast path for 1 and 2 byte only
-
-      const __m512i twobytes = _mm512_ternarylogic_epi32(
-          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
-          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
-      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
-                                 _mm512_set1_epi16(int16_t(0x80c0)));
-      const __m512i cmpmask =
-          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
-                                  _mm512_set1_epi16(0x0800));
-      const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
-      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
-      _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
-                              out);
-      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
-      carry = 0;
-
-      if (inlen < 32) {
-        goto tail;
-      } else {
-        continue;
-      }
-    }
-    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
-
-
-    __m512i taglo = _mm512_set1_epi32(0x8080e000);
-    __m512i taghi = taglo;
-
-    const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
-    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
-        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
-    const __mmask32 losurr = _mm512_cmp_epu16_mask(
-        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
-
-    int carryout = 0;
-    if (!_kortestz_mask32_u8(hisurr, losurr)) {
-      // handle surrogates
-
-      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
-      __m512i his = _mm512_alignr_epi32(lo, hi, 1);
-
-      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
-      taglo =
-          _mm512_mask_mov_epi32(taglo,__mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
-      taghi =
-          _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
-
-      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
-      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
-      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
-      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
-      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
-      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
-
-      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
-
-      const uint32_t  h = _cvtmask32_u32(hisurr);
-      const uint32_t  l = _cvtmask32_u32(losurr);
-      // check for mismatched surrogates
-      if ((h + h + carry) ^ l) {
-        const uint32_t lonohi = l & ~(h + h + carry);
-        const uint32_t hinolo = h & ~(l >> 1);
-        inlen = _tzcnt_u32(hinolo | lonohi);
-        inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
-        in = _mm512_maskz_mov_epi16(inmask, in);
-        adjust = (int)inlen - 31;
-        inlen = 0;
-        goto failiteration;
-      }
-    }
+template<endianness big_endian>
+size_t utf16_to_utf8_avx512i(const char16_t* inbuf, size_t inlen,
+    unsigned char* outbuf, size_t* outlen)
+{
+    __m512i in;
+    __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
+    __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    const char16_t* const inbuf_orig = inbuf;
+    const unsigned char* const outbuf_orig = outbuf;
+    size_t adjust = 0;
+    int carry = 0;
+
+    while (inlen >= 32) {
+        in = _mm512_loadu_si512(inbuf);
+        if (big_endian) {
+            in = _mm512_shuffle_epi8(in, byteflip);
+        }
+        inlen -= 31;
+    lastiteration:
+        inbuf += 31;
+
+    failiteration:
+        const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
+            inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
+
+        if (_ktestz_mask32_u8(inmask, is234byte)) {
+            // fast path for ASCII only
+            _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
+            outbuf += 31;
+            carry = 0;
+
+            if (inlen < 32) {
+                goto tail;
+            } else {
+                continue;
+            }
+        }
 
-    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff),hi);
-    carry = carryout;
+        const __mmask32 is12byte = _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
+
+        if (_ktestc_mask32_u8(is12byte, inmask)) {
+            // fast path for 1 and 2 byte only
+
+            const __m512i twobytes = _mm512_ternarylogic_epi32(
+                _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
+                _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
+            in = _mm512_mask_add_epi16(in, is234byte, twobytes,
+                _mm512_set1_epi16(int16_t(0x80c0)));
+            const __m512i cmpmask = _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
+                _mm512_set1_epi16(0x0800));
+            const __mmask64 smoosh = _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
+            const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
+            _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), _cvtmask64_u64(smoosh))),
+                out);
+            outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
+            carry = 0;
+
+            if (inlen < 32) {
+                goto tail;
+            } else {
+                continue;
+            }
+        }
+        __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+        __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+
+        __m512i taglo = _mm512_set1_epi32(0x8080e000);
+        __m512i taghi = taglo;
+
+        const __m512i fc00masked = _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
+        const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
+            inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
+        const __mmask32 losurr = _mm512_cmp_epu16_mask(
+            fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
+
+        int carryout = 0;
+        if (!_kortestz_mask32_u8(hisurr, losurr)) {
+            // handle surrogates
+
+            __m512i los = _mm512_alignr_epi32(hi, lo, 1);
+            __m512i his = _mm512_alignr_epi32(lo, hi, 1);
+
+            const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
+            taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr), _mm512_set1_epi32(0x808080f0));
+            taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), _mm512_set1_epi32(0x808080f0));
+
+            lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
+            hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
+            los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
+            his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
+            lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
+            hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
+
+            carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
+
+            const uint32_t h = _cvtmask32_u32(hisurr);
+            const uint32_t l = _cvtmask32_u32(losurr);
+            // check for mismatched surrogates
+            if ((h + h + carry) ^ l) {
+                const uint32_t lonohi = l & ~(h + h + carry);
+                const uint32_t hinolo = h & ~(l >> 1);
+                inlen = _tzcnt_u32(hinolo | lonohi);
+                inmask = __mmask32(0x7fffffff & ((1 << inlen) - 1));
+                in = _mm512_maskz_mov_epi16(inmask, in);
+                adjust = (int)inlen - 31;
+                inlen = 0;
+                goto failiteration;
+            }
+        }
 
-    __m512i mslo =
-        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
+        hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
+        carry = carryout;
 
-    __m512i mshi =
-        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
+        __m512i mslo = _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
 
-    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
-    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
+        __m512i mshi = _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
 
-    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
-    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
-    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
+        const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
+        const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
 
-    taglo =
-        _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
-    taghi =
-        _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
-    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
-    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
+        const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
+        const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
+        const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
 
+        taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), _mm512_set1_epi32(0x80c00000));
+        taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), _mm512_set1_epi32(0x80c00000));
+        __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
+        __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
 
-    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
-    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
+        magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
+        magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), _mm512_set1_epi32(0xffffffff),
+            _mm512_set1_epi32(0x00010101));
 
-    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
-                                     0xea); // A&B|C
-    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
-                                     0xea);
-    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
+        mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
+            0xea); // A&B|C
+        mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
+            0xea);
+        mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
 
-    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
+        mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
 
-    const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
-    const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
-    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
-    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
-    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
-    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
+        const __mmask64 wantlo = _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
+        const __mmask64 wanthi = _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
+        const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
+        const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
+        const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
+        const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
 
-    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
-    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
+        uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
+        uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
 
-    _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
-    _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
-    outbuf += advlo + advhi;
-  }
-  outbuf -= adjust;
+        _mm512_mask_storeu_epi8(outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
+        _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
+        outbuf += advlo + advhi;
+    }
+    outbuf -= adjust;
 
 tail:
-  if (inlen != 0) {
-    // We must have inlen < 31.
-    inmask = _cvtu32_mask32((1 << inlen) - 1);
-    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
-    if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-    adjust = inlen - 31;
-    inlen = 0;
-    goto lastiteration;
-  }
-  *outlen = (outbuf - outbuf_orig) + adjust;
-  return ((inbuf - inbuf_orig) + adjust);
+    if (inlen != 0) {
+        // We must have inlen < 31.
+        inmask = _cvtu32_mask32((1 << inlen) - 1);
+        in = _mm512_maskz_loadu_epi16(inmask, inbuf);
+        if (big_endian) {
+            in = _mm512_shuffle_epi8(in, byteflip);
+        }
+        adjust = inlen - 31;
+        inlen = 0;
+        goto lastiteration;
+    }
+    *outlen = (outbuf - outbuf_orig) + adjust;
+    return ((inbuf - inbuf_orig) + adjust);
 }
 /* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
 
@@ -18082,141 +20851,138 @@ tail:
 namespace simdutf {
 namespace icelake {
 
-
 simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    const char *buf = input;
+implementation::detect_encodings(const char* input,
+    size_t length) const noexcept
+{ // Returns the BOM's encoding when one is present; otherwise the encoding(s) the bytes validate as, or unspecified.
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) { // even byte count: UTF-16LE (and UTF-32LE when length % 4 == 0) are checked as well as UTF-8
+        const char* buf = input;
+
+        const char* start = buf;
+        const char* end = input + length;
+
+        bool is_utf8 = true;
+        bool is_utf16 = true;
+        bool is_utf32 = true;
+
+        int out = 0; // OR of every encoding_type value that validates
+
+        avx512_utf8_checker checker {};
+        __m512i currentmax = _mm512_setzero_si512(); // running unsigned maximum of each 32-bit lane, used by the UTF-32 range check below
+        while (buf + 64 <= end) { // 64 bytes per iteration = 32 16-bit words = 16 32-bit words
+            __m512i in = _mm512_loadu_si512((__m512i*)buf);
+            __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 16-bit wrapping subtract: 0xD800..0xDFFF become 0..0x7FF
+            __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set when 16-bit word i is a surrogate
+            if (surrogates) {
+                is_utf8 = false;
+
+                // Can still be either UTF-16LE or UTF-32, depending on the positions
+                // of the surrogates. To be valid UTF-32, a surrogate cannot be in the
+                // two most significant bytes of any 32-bit word. On the other hand, to
+                // be valid UTF-16LE, at least one surrogate must be in the two most
+                // significant bytes of a 32-bit word, since they always come in pairs in
+                // UTF-16LE. Note that we always proceed in multiples of 4 bytes before
+                // this point, so there is no offset in 32-bit words.
+
+                if ((surrogates & 0xaaaaaaaa) != 0) { // 0xaaaaaaaa = odd-numbered 16-bit words, i.e. the upper half of each 32-bit word
+                    is_utf32 = false;
+                    __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
+                        diff, _mm512_set1_epi16(uint16_t(0x0400))); // diff < 0x0400 <=> word in 0xD800..0xDBFF
+                    __mmask32 lowsurrogates = surrogates ^ highsurrogates; // the remaining surrogates are the low ones (0xDC00..0xDFFF)
+                    // high must be followed by low
+                    if ((highsurrogates << 1) != lowsurrogates) {
+                        return simdutf::encoding_type::unspecified;
+                    }
 
-    const char *start = buf;
-    const char *end = input + length;
+                    bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // bit 31 = last 16-bit word of this chunk
+                    if (ends_with_high) {
+                        buf += 31 * sizeof(char16_t); // advance only by 31 words so that we start
+                                                      // with the high surrogate on the next round.
+                    } else {
+                        buf += 32 * sizeof(char16_t);
+                    }
+                    is_utf16 = validate_utf16le(reinterpret_cast<const char16_t*>(buf),
+                        (end - buf) / sizeof(char16_t)); // validate everything after this chunk as UTF-16LE
+                    if (!is_utf16) {
+                        return simdutf::encoding_type::unspecified;
 
-    bool is_utf8 = true;
-    bool is_utf16 = true;
-    bool is_utf32 = true;
+                    } else {
+                        return simdutf::encoding_type::UTF16_LE;
+                    }
 
-    int out = 0;
+                } else {
+                    is_utf16 = false;
+                    // Check for UTF-32
+                    if (length % 4 == 0) {
+                        const char32_t* input32 = reinterpret_cast<const char32_t*>(buf);
+                        const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + length / 4; // one past the last 32-bit word of the input
+                        if (validate_utf32(input32, end32 - input32)) { // NOTE(review): words before buf are only tracked in currentmax, which this path does not consult - confirm
+                            return simdutf::encoding_type::UTF32_LE;
+                        }
+                    }
+                    return simdutf::encoding_type::unspecified;
+                }
+                break; // not reached: every path through the if/else above returns
+            }
+            // If no surrogate, validate under other encodings as well
 
-    avx512_utf8_checker checker{};
-    __m512i currentmax = _mm512_setzero_si512();
-    while (buf + 64 <= end) {
-      __m512i in = _mm512_loadu_si512((__m512i *)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if (surrogates) {
-        is_utf8 = false;
-
-        // Can still be either UTF-16LE or UTF-32 depending on the positions
-        // of the surrogates To be valid UTF-32, a surrogate cannot be in the
-        // two most significant bytes of any 32-bit word. On the other hand, to
-        // be valid UTF-16LE, at least one surrogate must be in the two most
-        // significant bytes of a 32-bit word since they always come in pairs in
-        // UTF-16LE. Note that we always proceed in multiple of 4 before this
-        // point so there is no offset in 32-bit words.
-
-        if ((surrogates & 0xaaaaaaaa) != 0) {
-          is_utf32 = false;
-          __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(
-              diff, _mm512_set1_epi16(uint16_t(0x0400)));
-          __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-          // high must be followed by low
-          if ((highsurrogates << 1) != lowsurrogates) {
-            return simdutf::encoding_type::unspecified;
-          }
-
-          bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-          if (ends_with_high) {
-            buf +=
-                31 *
-                sizeof(char16_t); // advance only by 31 words so that we start
-                                  // with the high surrogate on the next round.
-          } else {
-            buf += 32 * sizeof(char16_t);
-          }
-          is_utf16 = validate_utf16le(reinterpret_cast<const char16_t *>(buf),
-                                      (end - buf) / sizeof(char16_t));
-          if (!is_utf16) {
-            return simdutf::encoding_type::unspecified;
+            // UTF-32 validation
+            currentmax = _mm512_max_epu32(in, currentmax);
 
-          } else {
-            return simdutf::encoding_type::UTF16_LE;
-          }
+            // UTF-8 validation
+            checker.check_next_input(in);
 
-        } else {
-          is_utf16 = false;
-          // Check for UTF-32
-          if (length % 4 == 0) {
-            const char32_t *input32 = reinterpret_cast<const char32_t *>(buf);
-            const char32_t *end32 =
-                reinterpret_cast<const char32_t *>(start) + length / 4;
-            if (validate_utf32(input32, end32 - input32)) {
-              return simdutf::encoding_type::UTF32_LE;
-            }
-          }
-          return simdutf::encoding_type::unspecified;
+            buf += 64;
         }
-        break;
-      }
-      // If no surrogate, validate under other encodings as well
-
-      // UTF-32 validation
-      currentmax = _mm512_max_epu32(in, currentmax);
 
-      // UTF-8 validation
-      checker.check_next_input(in);
-
-      buf += 64;
-    }
+        // Check which encodings are possible
 
-    // Check which encodings are possible
+        if (is_utf8) {
+            size_t current_length = static_cast<size_t>(buf - start); // bytes already consumed by the vector loop
+            if (current_length != length) {
+                const __m512i utf8 = _mm512_maskz_loadu_epi8(
+                    (1ULL << (length - current_length)) - 1, (const __m512i*)buf); // masked load of the remaining (< 64) bytes
+                checker.check_next_input(utf8);
+            }
+            checker.check_eof();
+            if (!checker.errors()) {
+                out |= simdutf::encoding_type::UTF8;
+            }
+        }
 
-    if (is_utf8) {
-      size_t current_length = static_cast<size_t>(buf - start);
-      if (current_length != length) {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8(
-            (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
-        checker.check_next_input(utf8);
-      }
-      checker.check_eof();
-      if (!checker.errors()) {
-        out |= simdutf::encoding_type::UTF8;
-      }
-    }
+        if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (length - (buf - start)) / 2)) { // only the tail is checked here; the vector loop saw no surrogates before buf
+            out |= simdutf::encoding_type::UTF16_LE;
+        }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(
-                        reinterpret_cast<const char16_t *>(buf),
-                        (length - (buf - start)) / 2)) {
-      out |= simdutf::encoding_type::UTF16_LE;
-    }
+        if (is_utf32 && (length % 4 == 0)) {
+            currentmax = _mm512_max_epu32(
+                _mm512_maskz_loadu_epi8(
+                    (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
+                    (const __m512i*)buf),
+                currentmax); // fold the remaining tail bytes into the running maximum
+            __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
+                _MM_CMPINT_GT); // any 32-bit lane above 0x10FFFF rules out UTF-32
+            if (outside_range == 0) {
+                out |= simdutf::encoding_type::UTF32_LE;
+            }
+        }
 
-    if (is_utf32 && (length % 4 == 0)) {
-      currentmax = _mm512_max_epu32(
-          _mm512_maskz_loadu_epi8(
-              (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
-              (const __m512i *)buf),
-          currentmax);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range == 0) {
-        out |= simdutf::encoding_type::UTF32_LE;
-      }
+        return out;
+    } else if (implementation::validate_utf8(input, length)) { // odd byte count: only UTF-8 is considered
+        return simdutf::encoding_type::UTF8;
+    } else {
+        return simdutf::encoding_type::unspecified;
     }
-
-    return out;
-  } else if (implementation::validate_utf8(input, length)) {
-    return simdutf::encoding_type::UTF8;
-  } else {
-    return simdutf::encoding_type::unspecified;
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-    avx512_utf8_checker checker{};
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    avx512_utf8_checker checker {};
     const char* ptr = buf;
     const char* end = ptr + len;
     for (; ptr + 64 <= end; ptr += 64) {
@@ -18224,969 +20990,1163 @@ simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t l
         checker.check_next_input(utf8);
     }
     {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
-       checker.check_next_input(utf8);
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - ptr)) - 1, (const __m512i*)ptr);
+        checker.check_next_input(utf8);
     }
     checker.check_eof();
-    return ! checker.errors();
+    return !checker.errors();
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-    avx512_utf8_checker checker{};
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{ // Checks buf[0..len) in 64-byte chunks; on an error, re-validates with the scalar routine from near the failing chunk, else returns SUCCESS with len.
+    avx512_utf8_checker checker {};
     const char* ptr = buf;
     const char* end = ptr + len;
-    size_t count{0};
+    size_t count { 0 }; // bytes covered by the chunks that have already passed
     for (; ptr + 64 <= end; ptr += 64) {
-      const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-      checker.check_next_input(utf8);
-      if(checker.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
-        res.count += count;
-        return res;
-      }
-      count += 64;
+        const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
+        checker.check_next_input(utf8);
+        if (checker.errors()) {
+            if (count != 0) { // step back one byte so the scalar pass starts inside the previous chunk
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
+            res.count += count; // add the starting offset back so the position is relative to buf
+            return res;
+        }
+        count += 64;
     }
     {
-      const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
-      checker.check_next_input(utf8);
-      if(checker.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
-        res.count += count;
-        return res;
-      } else {
-        return result(error_code::SUCCESS, len);
-      }
-    }
-}
-
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return icelake::validate_ascii(buf, len);
-}
-
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  const char* buf_orig = buf;
-  const char* end = buf + len;
-  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  for (; buf + 64 <= end; buf += 64) {
-    const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
-    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-    if(notascii) {
-      return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
-    }
-  }
-  {
-    const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
-    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-    if(notascii) {
-      return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
-    }
-  }
-  return result(error_code::SUCCESS, len);
-}
-
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-    const char16_t *end = buf + len;
-
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_loadu_si512((__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+        const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL << (end - ptr)) - 1, (const __m512i*)ptr); // tail: masked load of the remaining end - ptr (< 64) bytes
+        checker.check_next_input(utf8);
+        if (checker.errors()) {
+            if (count != 0) { // same one-byte step back as in the loop above
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf + count), len - count);
+            res.count += count; // add the starting offset back so the position is relative to buf
+            return res;
+        } else {
+            return result(error_code::SUCCESS, len);
+        }
+    }
+}
+
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{ // Member entry point for ASCII validation of buf[0..len).
+    return icelake::validate_ascii(buf, len); // delegates to the namespace-level icelake::validate_ascii and returns its result unchanged
+}
+
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{ // Returns TOO_LARGE with the offset of the first byte >= 0x80, or SUCCESS with len when there is none.
+    const char* buf_orig = buf; // start of the input, used to turn pointers back into offsets
+    const char* end = buf + len;
+    const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); // 0x80 in every byte lane: the smallest non-ASCII value
+    for (; buf + 64 <= end; buf += 64) { // full 64-byte chunks
+        const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
+        __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); // NLT = not less than: bit i set when byte i >= 0x80 (unsigned)
+        if (notascii) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii)); // lowest set bit = first offending byte in this chunk
+        }
+    }
+    { // tail: fewer than 64 bytes remain (possibly none)
+        const __m512i input = _mm512_maskz_loadu_epi8((1ULL << (end - buf)) - 1, (const __m512i*)buf); // load only the remaining end - buf bytes; the other lanes are zeroed
+        __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+        if (notascii) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
+        }
+    }
+    return result(error_code::SUCCESS, len);
+}
+
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{ // Returns false as soon as a block's high-surrogate mask, shifted up one lane, differs from its low-surrogate mask; true otherwise.
+    const char16_t* end = buf + len;
+
+    for (; buf + 32 <= end;) { // 32 UTF-16 code units (64 bytes) per iteration
+        __m512i in = _mm512_loadu_si512((__m512i*)buf);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 16-bit wrapping subtract: 0xD800..0xDFFF become 0..0x7FF
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set when unit i is a surrogate
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); // diff < 0x0400 <=> unit in 0xD800..0xDBFF
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates; // the remaining surrogates are the low ones (0xDC00..0xDFFF)
+            // high must be followed by low: the high mask shifted up one lane must equal the low mask
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // bit 31 = last unit of this block
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
+            buf += 32;
         }
-      } else {
-        buf += 32;
-      }
     }
-    if(buf < end) {
-      __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
+    if (buf < end) { // tail: between 1 and 31 units remain
+        __m512i in = _mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf); // NOTE(review): `1` is an int, so with 31 units left (1 << 31) - 1 relies on signed wraparound - consider 1u
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low; a high surrogate in the final unit fails because the masked-off lanes are zero
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
         }
-      }
     }
     return true;
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-   const char16_t *end = buf + len;
-   const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{ // Same surrogate-pairing check as validate_utf16le, applied after swapping the two bytes of every 16-bit unit.
+    const char16_t* end = buf + len;
+    const __m512i byteflip = _mm512_setr_epi64( // shuffle control: byte indices 1,0,3,2,... swap the bytes of each 16-bit lane
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    for (; buf + 32 <= end;) { // 32 UTF-16 code units (64 bytes) per iteration
+        __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); // load, then byte-swap each unit
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 16-bit wrapping subtract: 0xD800..0xDFFF become 0..0x7FF
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set when unit i is a surrogate
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); // diff < 0x0400 <=> unit in 0xD800..0xDBFF
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates; // the remaining surrogates are the low ones (0xDC00..0xDFFF)
+            // high must be followed by low: the high mask shifted up one lane must equal the low mask
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // bit 31 = last unit of this block
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
+            buf += 32;
         }
-      } else {
-        buf += 32;
-      }
     }
-    if(buf < end) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-           return false;
+    if (buf < end) { // tail: between 1 and 31 units remain
+        __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1 << (end - buf)) - 1, (__m512i*)buf), byteflip); // NOTE(review): `1` is an int, so with 31 units left (1 << 31) - 1 relies on signed wraparound - consider 1u
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low; a high surrogate in the final unit fails because the masked-off lanes are zero
+            if ((highsurrogates << 1) != lowsurrogates) {
+                return false;
+            }
         }
-      }
     }
     return true;
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-    const char16_t *start_buf = buf;
-    const char16_t *end = buf + len;
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_loadu_si512((__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept // Checks surrogate pairing over len code units; yields SUCCESS/len, or SURROGATE/index of the first offending unit.
+{
+    const char16_t* start_buf = buf;
+    const char16_t* end = buf + len;
+    for (; buf + 32 <= end;) { // whole 512-bit blocks of 32 code units
+        __m512i in = _mm512_loadu_si512((__m512i*)buf);
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 0xD800..0xDFFF -> 0x0000..0x07FF
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set: unit i is a surrogate
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); // 0xD800..0xDBFF
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates; // the remaining surrogates: 0xDC00..0xDFFF
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); // first low with no high right before it
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); // first high with no low right after it
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // bit 31 is the last unit of this block
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
-        }
-      } else {
-        buf += 32;
-      }
-    }
-    if(buf < end) {
-      __m512i in = _mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-      }
+            buf += 32;
+        }
+    }
+    if (buf < end) { // 1..31 trailing code units
+        __m512i in = _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i*)buf); // 1U: with 31 units left the signed form (1 << 31) - 1 overflows int; masked-off lanes load as zero
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) { // a high in the last real unit shifts onto a zero lane and is rejected here
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+        }
     }
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-    const char16_t *start_buf = buf;
-    const char16_t *end = buf + len;
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept // Big-endian twin of the LE validator: bytes of each unit are swapped first; yields SUCCESS/len or SURROGATE/index.
+{
+    const char16_t* start_buf = buf;
+    const char16_t* end = buf + len;
     const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-    for(;buf + 32 <= end; ) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-        bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-        if(ends_with_high) {
-          buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809); // shuffle indices 1,0,3,2,...: swap the two bytes of every 16-bit unit
+    for (; buf + 32 <= end;) { // whole 512-bit blocks of 32 code units
+        __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip); // byte-swapped units, so the checks below match the LE validator
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); // 0xD800..0xDFFF -> 0x0000..0x07FF
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); // bit i set: unit i is a surrogate
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); // 0xD800..0xDBFF
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates; // the remaining surrogates: 0xDC00..0xDFFF
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) {
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); // first low with no high right before it
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); // first high with no low right after it
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+            bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // bit 31 is the last unit of this block
+            if (ends_with_high) {
+                buf += 31; // advance only by 31 words so that we start with the high surrogate on the next round.
+            } else {
+                buf += 32;
+            }
         } else {
-          buf += 32;
-        }
-      } else {
-        buf += 32;
-      }
-    }
-    if(buf < end) {
-      __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1<<(end-buf))-1,(__m512i*)buf), byteflip);
-      __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-      __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-      if(surrogates) {
-        __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-        __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-        // high must be followed by low
-        if ((highsurrogates << 1) != lowsurrogates) {
-          uint32_t extra_low = _tzcnt_u32(lowsurrogates &~(highsurrogates << 1));
-          uint32_t extra_high = _tzcnt_u32(highsurrogates &~(lowsurrogates >> 1));
-          return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
-        }
-      }
+            buf += 32;
+        }
+    }
+    if (buf < end) { // 1..31 trailing code units
+        __m512i in = _mm512_shuffle_epi8(_mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i*)buf), byteflip); // 1U: with 31 units left the signed form (1 << 31) - 1 overflows int; masked-off lanes load as zero
+        __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+        __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+        if (surrogates) {
+            __mmask32 highsurrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+            __mmask32 lowsurrogates = surrogates ^ highsurrogates;
+            // high must be followed by low
+            if ((highsurrogates << 1) != lowsurrogates) { // a high in the last real unit shifts onto a zero lane and is rejected here
+                uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+                uint32_t extra_high = _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+                return result(error_code::SURROGATE, (buf - start_buf) + (extra_low < extra_high ? extra_low : extra_high));
+            }
+        }
     }
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t * tail = icelake::validate_utf32(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept // Validates len UTF-32 code points: vector pass first, scalar pass on whatever is left.
+{
+    const char32_t* tail = icelake::validate_utf32(buf, len); // null is treated as failure; otherwise the position where the scalar pass resumes
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf)); // remaining element count = len minus what lies before tail
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept // Validates UTF-32; yields SUCCESS/len, or TOO_LARGE / SURROGATE with the index of the first bad code point.
+{
 
     const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
     const char32_t* buf_orig = buf;
     while (buf <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range) {
-        return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
-      }
-
-      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-                                _MM_CMPINT_GT);
-      if (surrogate_range) {
-        return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
-      }
-      buf += 16;
-    }
-    if(buf < buf_orig + len) {
-      __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1<<(buf_orig + len - buf))-1),(const __m512i*)buf);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-                                _MM_CMPINT_GT);
-      if (outside_range) {
-        return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
-      }
-      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-                                _MM_CMPINT_GT);
-      if (surrogate_range) {
-        return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
-      }
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf); // 16 code points per iteration
+        __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+            _MM_CMPINT_GT); // unsigned compare: lanes greater than 0x10FFFF
+        if (outside_range) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range)); // lowest set bit = first offending lane
+        }
+
+        __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); // wrapping add: 0xD800..0xDFFF land on 0xFFFFF800..0xFFFFFFFF
+
+        __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+            _MM_CMPINT_GT); // set only for lanes that were surrogates, given the range check above passed
+        if (surrogate_range) {
+            return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
+        }
+        buf += 16;
+    }
+    if (buf < buf_orig + len) { // fewer than 16 code points remain
+        __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1 << (buf_orig + len - buf)) - 1), (const __m512i*)buf); // load only the remaining lanes; the rest are zeroed
+        __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+            _MM_CMPINT_GT);
+        if (outside_range) {
+            return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u32(outside_range));
+        }
+        __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); // same surrogate test as in the main loop
+
+        __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+            _MM_CMPINT_GT);
+        if (surrogate_range) {
+            return result(error_code::SURROGATE, buf - buf_orig + _tzcnt_u32(surrogate_range));
+        }
     }
 
     return result(error_code::SUCCESS, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.second == nullptr) {
-    return 0;
-  }
-  return ret.second - utf16_output;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept // Latin-1 -> UTF-8 entry point of this implementation.
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output); // forwards unchanged to the scalar converter and returns its result
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.second == nullptr) {
-    return 0;
-  }
-  return ret.second - utf16_output;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept // Latin-1 -> UTF-16LE entry point of this implementation.
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output); // forwards to the scalar converter, little-endian instantiation
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept // Latin-1 -> UTF-16BE entry point of this implementation.
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output); // forwards to the scalar converter, big-endian instantiation
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-   return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept // Latin-1 -> UTF-32. NOTE(review): despite its name, latin1_output is the char32_t destination buffer.
+{
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output); // forwards unchanged to the scalar converter and returns its result
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output);
-  size_t saved_bytes = ret.second - utf16_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept // UTF-8 -> Latin-1 entry point of this implementation.
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output); // forwards unchanged to the scalar converter and returns its result
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept // UTF-8 -> Latin-1, returning a result (error code + count) instead of a bare size.
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output); // forwards unchanged to the scalar converter
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept // UTF-8 -> Latin-1 via the convert_valid variant (presumably for input already known to be valid - confirm with the scalar routine).
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output); // forwards unchanged to the scalar converter
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept // UTF-8 -> UTF-16LE; returns the number of char16_t written, or 0 when the helper reports failure.
+{
+    utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.second == nullptr) { // a null output pointer is the helper's failure signal
+        return 0;
+    }
+    return ret.second - utf16_output; // distance from the start of the output buffer to where the helper stopped writing
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept // UTF-8 -> UTF-16BE; returns the number of char16_t written, or 0 when the helper reports failure.
+{
+    utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.second == nullptr) { // a null output pointer is the helper's failure signal
+        return 0;
+    }
+    return ret.second - utf16_output; // distance from the start of the output buffer to where the helper stopped writing
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept // UTF-8 -> UTF-16LE, returning a result (error code + count).
+{
+    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output); // the helper's result is returned as-is
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept // UTF-8 -> UTF-16BE, returning a result (error code + count).
+{
+    return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output); // the helper's result is returned as-is
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept // UTF-8 -> UTF-16LE without validation: vector pass, then a scalar pass over any unconsumed tail. Returns char16_t units written.
+{
+    utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(buf, len, utf16_output); // first = next unread input byte, second = next output slot
+    size_t saved_bytes = ret.second - utf16_output; // despite the name, a count of char16_t elements (pointer difference)
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside the 16-byte window.
+    //       This means we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // 10xxxxxx = UTF-8 continuation byte
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second); // remaining input bytes, written right after the vector output
+        if (scalar_saved_bytes == 0) { // scalar pass produced nothing for a non-empty tail: report failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept // UTF-8 -> UTF-16BE without validation: vector pass, then a scalar pass over any unconsumed tail. Returns char16_t units written.
+{
+    utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output); // first = next unread input byte, second = next output slot
+    size_t saved_bytes = ret.second - utf16_output; // despite the name, a count of char16_t elements (pointer difference)
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside the 16-byte window.
+    //       This means we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // 10xxxxxx = UTF-8 continuation byte
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second); // remaining input bytes, written right after the vector output
+        if (scalar_saved_bytes == 0) { // scalar pass produced nothing for a non-empty tail: report failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept // Validating UTF-8 -> UTF-32; returns code points written, or 0 on failure.
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out); // the vector helper is instantiated for uint32_t output
+    utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+    if (ret.second == nullptr) // a null output pointer is the helper's failure signal
+        return 0;
+
+    size_t saved_bytes = ret.second - utf32_output; // despite the name, a count of 32-bit elements (pointer difference)
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // 10xxxxxx = UTF-8 continuation byte
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
+            ret.first, len - (ret.first - buf), utf32_out + saved_bytes); // remaining input bytes, written right after the vector output
+        if (scalar_saved_bytes == 0) { // scalar converter returned 0 for a non-empty tail: report failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
     return saved_bytes;
-  }
-
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
-
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16_result ret = icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(buf, len, utf16_output);
-  size_t saved_bytes = ret.second - utf16_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept // Validating UTF-8 -> UTF-32; on SUCCESS count = code points written, on error count = input offset reported by the scalar routine, rebased onto buf.
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32); // the vector helper is instantiated for uint32_t output
+    auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output); // tuple: <0> input position, <1> output position, <2> ok flag
+    if (!std::get<2>(ret)) { // vector pass flagged a problem: let the scalar routine pinpoint it
+        auto new_buf = std::get<0>(ret);
+        // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
+        // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
+        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t*>(std::get<1>(ret)));
+        res.count += (std::get<0>(ret) - buf); // rebase the count from new_buf onto the start of the input
+        return res;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output; // despite the name, a count of 32-bit elements (pointer difference)
+    const char* end = buf + len;
+    if (std::get<0>(ret) == end) {
+        return { simdutf::SUCCESS, saved_bytes };
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside 16-byte window.
+    //       It means, we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) { // 10xxxxxx = UTF-8 continuation byte
+        std::get<0>(ret) += 1;
+    }
+
+    if (std::get<0>(ret) != end) {
+        auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t*>(utf32_output) + saved_bytes); // remaining input bytes, written right after the vector output
+        if (scalar_result.error != simdutf::SUCCESS) {
+            scalar_result.count += (std::get<0>(ret) - buf); // error: add the input bytes already consumed
+        } else {
+            scalar_result.count += saved_bytes; // success: add the code points the vector pass already wrote
+        }
+        return scalar_result;
+    }
+
+    return { simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output) }; // only continuation bytes were left; same value as saved_bytes
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept // UTF-8 -> UTF-32 without validation: vector pass, then a scalar pass over any unconsumed tail. Returns code points written.
+{
+    uint32_t* utf32_output = reinterpret_cast<uint32_t*>(utf32_out); // the vector helper is instantiated for uint32_t output
+    utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output); // first = next unread input byte, second = next output slot
+    size_t saved_bytes = ret.second - utf32_output; // despite the name, a count of 32-bit elements (pointer difference)
+    const char* end = buf + len;
+    if (ret.first == end) {
+        return saved_bytes;
+    }
+
+    // Note: the AVX512 procedure looks up 4 bytes forward, and
+    //       correctly converts multi-byte chars even if their
+    //       continuation bytes lie outside the 16-byte window.
+    //       This means we have to skip continuation bytes from
+    //       the beginning ret.first, as they were already consumed.
+    while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { // 10xxxxxx = UTF-8 continuation byte
+        ret.first += 1;
+    }
+
+    if (ret.first != end) {
+        const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
+            ret.first, len - (ret.first - buf), utf32_out + saved_bytes); // remaining input bytes, written right after the vector output
+        if (scalar_saved_bytes == 0) { // scalar pass produced nothing for a non-empty tail: report failure
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+
     return saved_bytes;
-  }
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept // UTF-16LE -> Latin-1 entry point of this implementation.
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output); // forwards to the scalar converter, little-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept // UTF-16BE -> Latin-1 entry point of this implementation.
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output); // forwards to the scalar converter, big-endian instantiation
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept // UTF-16LE -> Latin-1, returning a result (error code + count).
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output); // forwards to the scalar converter, little-endian instantiation
+}
 
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept // UTF-16BE -> Latin-1, returning a result (error code + count).
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output); // forwards to the scalar converter, big-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept // UTF-16BE -> Latin-1 via the convert_valid variant (presumably for input already known to be valid - confirm with the scalar routine).
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output); // forwards to the scalar converter, big-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept // UTF-16LE -> Latin-1 via the convert_valid variant (presumably for input already known to be valid - confirm with the scalar routine).
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output); // forwards to the scalar converter, little-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept // UTF-16LE -> UTF-8; returns UTF-8 bytes written, or 0 if the kernel did not consume all of the input.
+{
+    size_t outlen; // filled in by the kernel through the pointer below
+    size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen); // number of input code units consumed
+    if (inlen != len) { // stopped early: treated as a conversion failure
+        return 0;
+    }
+    return outlen;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept // UTF-16BE -> UTF-8; returns UTF-8 bytes written, or 0 if the kernel did not consume all of the input.
+{
+    size_t outlen; // filled in by the kernel through the pointer below
+    size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen); // number of input code units consumed
+    if (inlen != len) { // stopped early: treated as a conversion failure
+        return 0;
+    }
+    return outlen;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    size_t outlen;
+    size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
+    if (inlen != len) {
+        result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
+        res.count += inlen;
+        return res;
+    }
+    return { simdutf::SUCCESS, outlen };
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    size_t outlen;
+    size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
+    if (inlen != len) {
+        result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
+        res.count += inlen;
+        return res;
+    }
+    return { simdutf::SUCCESS, outlen };
+}
 
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept // "valid" UTF-16LE -> UTF-8 entry point; no separate fast path here.
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output); // reuses the general converter of this class
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept // "valid" UTF-16BE -> UTF-8 entry point; no separate fast path here.
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output); // reuses the general converter of this class
+}
 
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept // UTF-32 -> Latin-1 entry point of this implementation.
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output); // forwards unchanged to the scalar converter and returns its result
 }
 
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept // UTF-32 -> Latin-1, returning a result (error code + count).
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); // forwards unchanged to the scalar converter
+}
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  utf8_to_utf32_result ret = icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-  if (ret.second == nullptr)
-    return 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept // UTF-32 -> Latin-1 via the convert_valid variant (presumably for input already known to be valid - confirm with the scalar routine).
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); // forwards unchanged to the scalar converter
+}
 
-  size_t saved_bytes = ret.second - utf32_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
     return saved_bytes;
-  }
-
-  // Note: the AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outside 16-byte window.
-  //       It means, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
-
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
-                                        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32);
-  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    auto new_buf = std::get<0>(ret);
-    // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
-    // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
-    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t *>(std::get<1>(ret)));
-    res.count += (std::get<0>(ret) - buf);
-    return res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  const char* end = buf + len;
-  if (std::get<0>(ret) == end) {
-    return {simdutf::SUCCESS, saved_bytes};
-  }
-
-  // Note: the AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outside 16-byte window.
-  //       It means, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (std::get<0>(ret) != end and ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
-      std::get<0>(ret) += 1;
-  }
-
-  if (std::get<0>(ret) != end) {
-    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
-    if (scalar_result.error != simdutf::SUCCESS) {
-      scalar_result.count +=  (std::get<0>(ret) - buf);
-    } else {
-      scalar_result.count += saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
     }
-    return scalar_result;
-  }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
 
-  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output);
 }
 
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_out) const noexcept {
-  uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  utf8_to_utf32_result ret = icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
-  size_t saved_bytes = ret.second - utf32_output;
-  const char* end = buf + len;
-  if (ret.first == end) {
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
     return saved_bytes;
-  }
-
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-      ret.first += 1;
-  }
-
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
-                                        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) { return 0; }
-  return outlen;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) { return 0; }
-  return outlen;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
-    res.count += inlen;
-    return res;
-  }
-  return {simdutf::SUCCESS, outlen};
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
-  if(inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
-    res.count += inlen;
-    return res;
-  }
-  return {simdutf::SUCCESS, outlen};
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the input buffer, not the number of words written, even if finished
+    std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the input buffer, not the number of words written, even if finished
+    std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = avx512_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    scalar_res.count += (std::get<0>(ret) - buf);
-    return scalar_res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_res.error) {
-      scalar_res.count += (std::get<0>(ret) - buf);
-      return scalar_res;
-    } else {
-      scalar_res.count += saved_bytes;
-      return scalar_res;
-    }
-  }
-  return simdutf::result(simdutf::SUCCESS, saved_bytes);
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    scalar_res.count += (std::get<0>(ret) - buf);
-    return scalar_res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_res.error) {
-      scalar_res.count += (std::get<0>(ret) - buf);
-      return scalar_res;
-    } else {
-      scalar_res.count += saved_bytes;
-      return scalar_res;
-    }
-  }
-  return simdutf::result(simdutf::SUCCESS, saved_bytes);
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) { return 0; }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  size_t pos = 0;
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (pos + 32 <= length) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    _mm512_storeu_si512(output + pos, utf16);
-    pos += 32;
-  }
-  if(pos < length) {
-    __mmask32 m((1<< (length - pos))-1);
-    __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos));
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    _mm512_mask_storeu_epi16(output + pos, m, utf16);
-  }
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        scalar_res.count += (std::get<0>(ret) - buf);
+        return scalar_res;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_res.error) {
+            scalar_res.count += (std::get<0>(ret) - buf);
+            return scalar_res;
+        } else {
+            scalar_res.count += saved_bytes;
+            return scalar_res;
+        }
+    }
+    return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        scalar_res.count += (std::get<0>(ret) - buf);
+        return scalar_res;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_res.error) {
+            scalar_res.count += (std::get<0>(ret) - buf);
+            return scalar_res;
+        } else {
+            scalar_res.count += saved_bytes;
+            return scalar_res;
+        }
+    }
+    return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
 
-  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::tuple<const char16_t*, char32_t*, bool> ret = icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (!std::get<2>(ret)) {
+        return 0;
+    }
+    size_t saved_bytes = std::get<1>(ret) - utf32_output;
+    if (std::get<0>(ret) != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{ // Writes `length` UTF-16 code units from `input` to `output` with the two bytes of each unit swapped.
+    size_t pos = 0;
+    const __m512i byteflip = _mm512_setr_epi64( // shuffle indices swapping the bytes of each 16-bit unit (pattern repeats every 128 bits)
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (pos + 32 <= length) { // full 512-bit vectors: 32 code units per iteration
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)(input + pos));
+        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+        _mm512_storeu_si512(output + pos, utf16);
+        pos += 32;
+    }
+    if (pos < length) { // tail: between 1 and 31 code units remain
+        __mmask32 m((1u << (length - pos)) - 1); // unsigned literal: with 31 units left, `(1 << 31) - 1` overflows a signed int (UB)
+        __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i*)(input + pos)); // masked load/store never touches memory past `length`
+        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+        _mm512_mask_storeu_epi16(output + pos, m, utf16);
+    }
+}
+
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  size_t count{0};
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 32;
-    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-    count += count_ones(not_high_surrogate);
-  }
+    size_t count { 0 };
 
-  return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 32;
+        uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
+        count += count_ones(not_high_surrogate);
+    }
+
+    return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-  size_t count{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
-    ptr += 32;
-    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-    count += count_ones(not_high_surrogate);
-  }
+    size_t count { 0 };
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
+        ptr += 32;
+        uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
+        count += count_ones(not_high_surrogate);
+    }
 
-  return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
+    return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
 }
 
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    const char* end = length >= 64 ? input + length - 64 : nullptr;
+    const char* ptr = input;
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  const char* end = length >= 64 ? input + length - 64 : nullptr;
-  const char* ptr = input;
+    const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
 
-  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
+    size_t count { 0 };
 
-  size_t count{0};
+    while (ptr <= end) {
+        __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 64;
+        uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
+        count += 64 - count_ones(continuation_bitmask);
+    }
 
-  while (ptr <= end) {
-    __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 64;
-    uint64_t continuation_bitmask = static_cast<uint64_t>(_mm512_cmple_epi8_mask(utf8, continuation));
-    count += 64 - count_ones(continuation_bitmask);
-  }
+    return count + scalar::utf8::count_code_points(ptr, length - (ptr - input));
+}
 
-  return count + scalar::utf8::count_code_points(ptr, length - (ptr - input));
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
 }
 
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return scalar::utf32::latin1_length_from_utf32(length);
+}
 
-  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  size_t count{0};
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 32;
-    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+    size_t count { 0 };
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 32;
+        __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+        __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
+        __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+        __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
 
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
-  }
+        size_t ascii_count = count_ones(ascii_bitmask);
+        size_t two_bytes_count = count_ones(two_bytes_bitmask);
+        size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+        size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
 
-  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
+        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 2 * surrogate_bytes_count;
+    }
+
+    return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
-  const char16_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
+    const char16_t* ptr = input;
 
-  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-  size_t count{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    ptr += 32;
-    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+    size_t count { 0 };
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809,
+        0x0607040502030001,
+        0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+        __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+        utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+        ptr += 32;
+        __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+        __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
+        __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+        __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+
+        size_t ascii_count = count_ones(ascii_bitmask);
+        size_t two_bytes_count = count_ones(two_bytes_bitmask);
+        size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+        size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 2 * surrogate_bytes_count;
+    }
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
-  }
+    return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return implementation::count_utf16le(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return implementation::count_utf16be(input, length);
+}
 
-  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return implementation::count_utf16le(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return implementation::count_utf16be(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= length; pos += 64) {
-      __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
-      uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= length; pos += 64) {
+        __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input + pos));
+        uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1));
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        uint64_t utf8_4byte = _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
-  const char32_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
+    const char32_t* ptr = input;
 
-  const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
-  const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
-  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+    const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
+    const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-  size_t count{0};
+    size_t count { 0 };
 
-  while (ptr <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 16;
-    __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
-    __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
-    __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
+    while (ptr <= end) {
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 16;
+        __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
+        __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
+        __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t three_bytes_count = count_ones(three_bytes_bitmask);
-    size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
-  }
+        size_t ascii_count = count_ones(ascii_bitmask);
+        size_t two_bytes_count = count_ones(two_bytes_bitmask);
+        size_t three_bytes_count = count_ones(three_bytes_bitmask);
+        size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
+        count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + 4 * four_bytes_count;
+    }
 
-  return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
+    return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
-  const char32_t* ptr = input;
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
+    const char32_t* ptr = input;
 
-  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-  size_t count{0};
+    size_t count { 0 };
 
-  while (ptr <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 16;
-    __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+    while (ptr <= end) {
+        __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+        ptr += 16;
+        __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
 
-    count += 16 + count_ones(surrogates_bitmask);
-  }
+        count += 16 + count_ones(surrogates_bitmask);
+    }
 
-  return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
+    return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return implementation::count_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return implementation::count_utf8(input, length);
 }
 
 } // namespace icelake
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/icelake/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/icelake/end.h
 /* begin file src/simdutf/icelake/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
 // nothing needed.
@@ -19194,7 +22154,6 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -19202,10 +22161,10 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /* end file src/icelake/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_HASWELL
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/implementation.cpp
 /* begin file src/haswell/implementation.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/begin.h
 /* begin file src/simdutf/haswell/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "haswell"
 // #define SIMDUTF_IMPLEMENTATION haswell
@@ -19217,7 +22176,7 @@ SIMDUTF_TARGET_HASWELL
 #endif
 
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
 #endif // end of workaround
 /* end file src/simdutf/haswell/begin.h */
 namespace simdutf {
@@ -19228,31 +22187,34 @@ namespace {
 #endif
 using namespace simd;
 
-
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
-  return input.reduce_or().is_ascii();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
+    return input.reduce_or().is_ascii();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_detect_encodings.cpp
 /* begin file src/haswell/avx2_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int avx2_detect_encodings(const char * buf, size_t len) {
+int avx2_detect_encodings(const char* buf, size_t len)
+{
     const char* start = buf;
     const char* end = buf + len;
 
@@ -19267,11 +22229,11 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 
     __m256i currentmax = _mm256_setzero_si256();
 
-    checker check{};
+    checker check {};
 
-    while(buf + 64 <= end) {
+    while (buf + 64 <= end) {
         __m256i in = _mm256_loadu_si256((__m256i*)buf);
-        __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(nextin);
@@ -19297,15 +22259,15 @@ int avx2_detect_encodings(const char * buf, size_t len) {
             if ((surrogates_bitmask0 & 0xaaaaaaaa) != 0) {
                 is_utf32 = false;
                 // Code from avx2_validate_utf16le.cpp
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
+                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint32_t V0 = ~surrogates_bitmask0;
 
-                const auto    vH0 = (in16 & v_fc) == v_dc;
+                const auto vH0 = (in16 & v_fc) == v_dc;
                 const uint32_t H0 = vH0.to_bitmask();
 
                 const uint32_t L0 = ~H0 & surrogates_bitmask0;
@@ -19338,7 +22300,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
                     } else {
                         const uint32_t V = ~surrogates_bitmask;
 
-                        const auto    vH = (in_16 & v_fc) == v_dc;
+                        const auto vH = (in_16 & v_fc) == v_dc;
                         const uint32_t H = vH.to_bitmask();
 
                         const uint32_t L = ~H & surrogates_bitmask;
@@ -19362,8 +22324,8 @@ int avx2_detect_encodings(const char * buf, size_t len) {
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
+                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
 
                     // Must start checking for surrogates
                     __m256i currentoffsetmax = _mm256_setzero_si256();
@@ -19377,14 +22339,14 @@ int avx2_detect_encodings(const char * buf, size_t len) {
                     currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(nextin, offset), currentoffsetmax);
 
                     while (input + 8 < end32) {
-                        const __m256i in32 = _mm256_loadu_si256((__m256i *)input);
-                        currentmax = _mm256_max_epu32(in32,currentmax);
+                        const __m256i in32 = _mm256_loadu_si256((__m256i*)input);
+                        currentmax = _mm256_max_epu32(in32, currentmax);
                         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in32, offset), currentoffsetmax);
                         input += 8;
                     }
 
                     __m256i forbidden_words = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
+                    if (_mm256_testz_si256(forbidden_words, forbidden_words) == 0) {
                         return simdutf::encoding_type::unspecified;
                     }
                 } else {
@@ -19411,7 +22373,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
+            uint8_t block[64] {};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -19422,14 +22384,14 @@ int avx2_detect_encodings(const char * buf, size_t len) {
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -19438,7 +22400,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 }
 /* end file src/haswell/avx2_detect_encodings.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_validate_utf16.cpp
 /* begin file src/haswell/avx2_validate_utf16.cpp */
 /*
     In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
@@ -19485,8 +22447,9 @@ int avx2_detect_encodings(const char * buf, size_t len) {
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-template <endianness big_endian>
-const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const char16_t* avx2_validate_utf16(const char16_t* input, size_t size)
+{
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -19528,19 +22491,19 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
             const uint32_t V = ~surrogates_bitmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint32_t H = vH.to_bitmask();
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint32_t L = ~H & surrogates_bitmask;
 
-            const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint32_t c = V | a | b;     // Combine all the masks into the final one.
+            const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
+            const uint32_t b = a << 1; // Just mark that the opposite fact is hold,
+                                       // thanks to that we have only two masks for valid case.
+            const uint32_t c = V | a | b; // Combine all the masks into the final one.
 
             if (c == 0xffffffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -19561,9 +22524,9 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
     return input;
 }
 
-
-template <endianness big_endian>
-const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
+{
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -19606,19 +22569,19 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint32_t V = ~surrogates_bitmask;
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint32_t H = vH.to_bitmask();
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint32_t L = ~H & surrogates_bitmask;
 
-            const uint32_t a = L & (H >> 1);  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint32_t b = a << 1;        // Just mark that the opposite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint32_t c = V | a | b;     // Combine all the masks into the final one.
+            const uint32_t a = L & (H >> 1); // A low surrogate must be followed by high one.
+                                             // (A low surrogate placed in the 7th register's word
+                                             // is an exception we handle.)
+            const uint32_t b = a << 1; // Just mark that the opposite fact is hold,
+                                       // thanks to that we have only two masks for valid case.
+            const uint32_t c = V | a | b; // Combine all the masks into the final one.
 
             if (c == 0xffffffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -19639,13 +22602,14 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
     return result(error_code::SUCCESS, input - start);
 }
 /* end file src/haswell/avx2_validate_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_validate_utf32le.cpp
 /* begin file src/haswell/avx2_validate_utf32le.cpp */
 /* Returns:
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
+const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size)
+{
     const char32_t* end = input + size;
 
     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
@@ -19655,26 +22619,26 @@ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
     __m256i currentoffsetmax = _mm256_setzero_si256();
 
     while (input + 8 < end) {
-        const __m256i in = _mm256_loadu_si256((__m256i *)input);
-        currentmax = _mm256_max_epu32(in,currentmax);
+        const __m256i in = _mm256_loadu_si256((__m256i*)input);
+        currentmax = _mm256_max_epu32(in, currentmax);
         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
         input += 8;
     }
     __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-    if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     return input;
 }
 
-
-const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size)
+{
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -19685,17 +22649,17 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz
     __m256i currentoffsetmax = _mm256_setzero_si256();
 
     while (input + 8 < end) {
-        const __m256i in = _mm256_loadu_si256((__m256i *)input);
-        currentmax = _mm256_max_epu32(in,currentmax);
+        const __m256i in = _mm256_loadu_si256((__m256i*)input);
+        currentmax = _mm256_max_epu32(in, currentmax);
         currentoffsetmax = _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
 
         __m256i is_zero = _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-        if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if(_mm256_testz_si256(is_zero, is_zero) == 0) {
+        if (_mm256_testz_si256(is_zero, is_zero) == 0) {
             return result(error_code::SURROGATE, input - start);
         }
         input += 8;
@@ -19705,303 +22669,286 @@ const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t siz
 }
 /* end file src/haswell/avx2_validate_utf32le.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf16.cpp
 /* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    __m256i ascii = _mm256_cvtepu8_epi16(in);
-    if (big_endian) {
-      const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      ascii = _mm256_shuffle_epi8(ascii, swap256);
-    }
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-    return 12;
-  }
-
-  const uint8_t idx =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    const __m128i composedminus =
-        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-    const __m128i lowtenbits =
-        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-    const __m128i hightenbits = _mm_srli_epi32(composedminus, 10);
-    const __m128i lowtenbitsadd =
-        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-    const __m128i hightenbitsadd =
-        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-    __m128i surrogates =
-        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (big_endian) {
-      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
-      surrogates = _mm_shuffle_epi8(surrogates, swap);
-    }
-    _mm_storeu_si128((__m128i *)basic_buffer, composed);
-    uint32_t surrogate_buffer[4];
-    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
-    for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] < 65536) {
-        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-        utf16_output++;
-      } else {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-        utf16_output += 2;
-      }
-    }
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+template<endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char16_t*& utf16_output)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i in = _mm_loadu_si128((__m128i*)input);
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // We process the data in chunks of 16 bytes.
+        __m256i ascii = _mm256_cvtepu8_epi16(in);
+        if (big_endian) {
+            const __m256i swap256 = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            ascii = _mm256_shuffle_epi8(ascii, swap256);
+        }
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf16_output), ascii);
+        utf16_output += 16; // We wrote 16 16-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 8; // We wrote 16 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4;
+        return 12;
+    }
+
+    const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential overflow of 4 bytes.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4; // Here we overflow by 8 bytes.
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        //////////////
+        // There might be garbage inputs where a leading byte masquerades as a four-byte
+        // leading byte (by being followed by 3 continuation bytes), but is less than
+        // 0xf0. This could trigger a buffer overflow if we only counted leading
+        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+        // We do so at the cost of an extra mask.
+        /////////////
+        const __m128i sh = _mm_loadu_si128((const __m128i*)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        // We deliberately carry the leading four bits in highbyte if they are present,
+        // we remove them later when computing hightenbits.
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+        // the corresponding 32-bit value in 'composed' will be greater than
+        // (0xf0000000>>6), i.e., greater than 0x3c00000. This can be used later to identify the
+        // location of the surrogate pairs.
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        const __m128i composedminus = _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+        const __m128i lowtenbits = _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+        // Notice the 0x3ff mask:
+        const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+        const __m128i lowtenbitsadd = _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+        const __m128i hightenbitsadd = _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+        const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+        __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+        uint32_t basic_buffer[4];
+        uint32_t basic_buffer_swap[4];
+        if (big_endian) {
+            _mm_storeu_si128((__m128i*)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
+            surrogates = _mm_shuffle_epi8(surrogates, swap);
+        }
+        _mm_storeu_si128((__m128i*)basic_buffer, composed);
+        uint32_t surrogate_buffer[4];
+        _mm_storeu_si128((__m128i*)surrogate_buffer, surrogates);
+        for (size_t i = 0; i < 3; i++) {
+            if (basic_buffer[i] > 0x3c00000) {
+                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+                utf16_output += 2;
+            } else {
+                utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+                utf16_output++;
+            }
+        }
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf8_to_utf32.cpp
 /* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char32_t *&utf32_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu8_epi32(in));
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output+8), _mm256_cvtepu8_epi32(_mm_srli_si128(in,8)));
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
-    utf32_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm256_storeu_si256((__m256i *)utf32_output, _mm256_cvtepu16_epi32(composed));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 3;
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+size_t convert_masked_utf8_to_utf32(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char32_t*& utf32_output)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i in = _mm_loadu_si128((__m128i*)input);
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // We process the data in chunks of 16 bytes.
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu8_epi32(in));
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+        utf32_output += 16; // We wrote 16 32-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm256_storeu_si256((__m256i*)utf32_output, _mm256_cvtepu16_epi32(composed));
+        utf32_output += 8; // We wrote 32 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back.
+
+    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm256_storeu_si256((__m256i*)utf32_output, _mm256_cvtepu16_epi32(composed));
+        utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
+        // overflow of 32 - 24 = 8 bytes.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+    } else if (idx < 209) {
+        // TWO (2) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf8.cpp
 /* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -20052,489 +22999,493 @@ size_t convert_masked_utf8_to_utf32(const char *input,
     - We need two 256-entry tables that have 8704 bytes in total.
 */
 
-
 /*
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
-  const char16_t* end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-    if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-          const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m256i t0 = _mm256_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m256i t2 = _mm256_and_si256(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m256i t3 = _mm256_or_si256(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          const uint32_t M0 = one_byte_bitmask & 0x55555555;
-          const uint32_t M1 = M0 >> 7;
-          const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-          // 4. pack the bytes
-
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-          const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-          const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-          utf8_output += row[0];
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-          utf8_output += row_2[0];
-
-          // 6. adjust pointers
-          buf += 16;
-          continue;
-    }
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                                0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m256i s0 = _mm256_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-        const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-        const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                              (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-        // Due to the wider registers, the following path is less likely to be useful.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-          const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-          utf8_output += 12;
-          buf += 16;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-        const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-        const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-        const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-        const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-        const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-        const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-        const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-        const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-        utf8_output += row2[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-        utf8_output += row3[0];
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+template<endianness big_endian>
+std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output)
+{
+    const char16_t* end = buf + len;
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+    const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+        if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+            // expected output   : [110a|aaaa|10bb|bbbb] x 16
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // 1. Check if there is any surrogate word in the input chunk.
+        //    We also have to deal with the situation when there is a surrogate word
+        //    at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, utf8_output);
+    } // while
+    return std::make_pair(buf, utf8_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-    if(_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-          const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m256i t0 = _mm256_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m256i t2 = _mm256_and_si256(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m256i t3 = _mm256_or_si256(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          const uint32_t M0 = one_byte_bitmask & 0x55555555;
-          const uint32_t M1 = M0 >> 7;
-          const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-          // 4. pack the bytes
-
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-          const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-          const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-          utf8_output += row[0];
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-          utf8_output += row_2[0];
-
-          // 6. adjust pointers
-          buf += 16;
-          continue;
-    }
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                                0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m256i s0 = _mm256_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-        const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-        const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                              (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-        // Due to the wider registers, the following path is less likely to be useful.
-        /*if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-          const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-          utf8_output += 12;
-          buf += 16;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-        const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-        const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-        const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-        const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-        const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-        const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-        const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-        const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-        utf8_output += row2[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-        utf8_output += row3[0];
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+template<endianness big_endian>
+std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+    const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+        if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // 1. Check whether there is any surrogate word in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf16_to_utf32.cpp
 /* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -20585,760 +23536,793 @@ std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t*
     - We need two 256-entry tables that have 8704 bytes in total.
 */
 
-
 /*
   Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* end = buf + len;
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: we extend all sixteen 16-bit words to sixteen 32-bit words
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
-        utf32_output += 16;
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          // No surrogate pair
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian>
+std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* end = buf + len;
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+    while (buf + 16 <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+
+        // 1. Check whether there is any surrogate word in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: we extend all sixteen 16-bit words to sixteen 32-bit words
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+            utf32_output += 16;
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    // No surrogate pair
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, utf32_output);
+    } // while
+    return std::make_pair(buf, utf32_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-
-  while (buf + 16 <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
-                                  17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: we extend all sixteen 16-bit words to sixteen 32-bit words
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in,1)));
-        utf32_output += 16;
-        buf += 16;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          // No surrogate pair
-          *utf32_output++ = char32_t(word);
+template<endianness big_endian>
+std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+    const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+    const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+    while (buf + 16 <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        if (big_endian) {
+            const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
+                17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+            in = _mm256_shuffle_epi8(in, swap);
+        }
+
+        // 1. Check whether there is any surrogate word in the input chunk.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m256i surrogates_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+        // bitmask = 0x00000000 if there are no surrogates
+        //         = 0xc0000000 if only the last word is a surrogate
+        const uint32_t surrogates_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc0000000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x00000000) {
+            // case: we extend all sixteen 16-bit words to sixteen 32-bit words
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output), _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(utf32_output + 8), _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+            utf32_output += 16;
+            buf += 16;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    // No surrogate pair
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
 /* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf8.cpp
 /* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  __m256i running_max = _mm256_setzero_si256();
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-  if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
-
-  return std::make_pair(buf, utf8_output);
-}
-
-
-std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-  const char32_t* end = buf + len;
-  const char32_t* start = buf;
-
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
-    // Check for too large input
-    const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-    if(static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
-
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
-
-    if(_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16),_mm256_extractf128_si256(in_16,1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0)  & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2>>16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle,shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed,1));
-      utf8_output += row_2[0];
-
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-      // Check for illegal surrogate words
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-      }
-
-      const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-                                              0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be useful.
-      /*if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
-        const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
-        utf8_output += 12;
-        buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
-      const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0,1), shuffle2);
-
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
-      const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1,1), shuffle3);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // may require large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {  // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) { // 2-byte
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word & 0xFFFF0000 )==0) {  // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {  // 4-byte
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    __m256i running_max = _mm256_setzero_si256();
+    __m256i forbidden_bytemask = _mm256_setzero_si256();
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1);
+        running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    // check for invalid input
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(_mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    return std::make_pair(buf, utf8_output);
+}
+
+std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{ // AVX2 UTF-32 => UTF-8 conversion that stops at the first detected error; returns a result (error code + input offset) paired with the advanced output pointer.
+    const char32_t* end = buf + len;
+    const char32_t* start = buf; // kept so positions can be reported as offsets from the start of the input
+
+    const __m256i v_0000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+    const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+    const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+    const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+    const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+    const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) { // each pass loads 16 code points (two 256-bit registers)
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        __m256i nextin = _mm256_loadu_si256((__m256i*)buf + 1); // the following 8 code points
+        // Check for too large input: every 32-bit lane must be <= 0x10FFFF
+        const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+        if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); // offset is the start of this 16-word block, not the exact offending word
+        }
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), _mm256_and_si256(nextin, v_7fffffff));
+        in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); // the pack works per 128-bit lane; swap the middle quarters to restore input order
+
+        // Try to apply UTF-16 => UTF-8 routine on 256 bits (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+        if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+            // 1. pack the bytes
+            const __m128i utf8_packed = _mm_packus_epi16(_mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+            // 2. store (16 bytes)
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+            // 3. adjust pointers
+            buf += 16;
+            utf8_output += 16;
+            continue; // we are done for this round!
+        }
+        // no bits set above 7th bit
+        const __m256i one_byte_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+        const uint32_t one_byte_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m256i one_or_two_bytes_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+        const uint32_t one_or_two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+        if (one_or_two_bytes_bitmask == 0xffffffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+            // expected output   : [110a|aaaa|10bb|bbbb] x 16
+            const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+            const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m256i t3 = _mm256_or_si256(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            const uint32_t M0 = one_byte_bitmask & 0x55555555;
+            const uint32_t M1 = M0 >> 7;
+            const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+            // 4. pack the bytes
+
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+            const uint8_t* row_2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)][0];
+
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i shuffle_2 = _mm_loadu_si128((__m128i*)(row_2 + 1));
+
+            const __m256i utf8_packed = _mm256_shuffle_epi8(utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_packed));
+            utf8_output += row[0];
+            _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_packed, 1));
+            utf8_output += row_2[0];
+
+            // 6. adjust pointers
+            buf += 16;
+            continue;
+        }
+        // Must check for overflow in packing
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+            // Check for illegal surrogate words
+            const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); // offset is the start of this 16-word block
+            }
+
+            const __m256i dup_even = _mm256_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+                0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+              1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+              2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+              3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+            const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000)); // bit 14 set only for three-byte words
+            const __m256i s4 = _mm256_xor_si256(s3, m0); // three-byte words: [11bb|bbbb] => [10bb|bbbb]
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+            const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint32_t mask = (one_byte_bitmask & 0x55555555) | (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+            // Due to the wider registers, the following path is less likely to be useful.
+            /*if(mask == 0) {
+              // We only have three-byte words. Use fast path.
+              const __m256i shuffle = _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
+              const __m256i utf8_0 = _mm256_shuffle_epi8(out0, shuffle);
+              const __m256i utf8_1 = _mm256_shuffle_epi8(out1, shuffle);
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_0,1));
+              utf8_output += 12;
+              _mm_storeu_si128((__m128i*)utf8_output, _mm256_extractf128_si256(utf8_1,1));
+              utf8_output += 12;
+              buf += 16;
+              continue;
+            }*/
+            const uint8_t mask0 = uint8_t(mask);
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+            const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+            const uint8_t* row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+            const __m128i shuffle2 = _mm_loadu_si128((__m128i*)(row2 + 1));
+            const __m128i utf8_2 = _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+            const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+            const uint8_t* row3 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+            const __m128i shuffle3 = _mm_loadu_si128((__m128i*)(row3 + 1));
+            const __m128i utf8_3 = _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0); // each store writes 16 bytes; the pointer then advances only by the row's first entry
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_2);
+            utf8_output += row2[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_3);
+            utf8_output += row3[0];
+            buf += 16;
+        } else {
+            // case: at least one 32-bit word is larger than 0xFFFF <=> it will produce four UTF-8 bytes.
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // may require large, non-trivial tables?
+            size_t forward = 15; // the scalar loop converts at most 15 words before the SIMD loop resumes
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 28
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); // exact index of the offending word
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else { // 4-byte
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output); // exact index of the offending word
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); // count covers only the words consumed above; any remaining tail is not converted here
 }
 /* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=haswell/avx2_convert_utf32_to_utf16.cpp
 /* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* end = buf + len;
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // AVX2 UTF-32 => UTF-16 conversion; returns {nullptr, out} on invalid input, otherwise {first unprocessed input word, out}.
+    const char32_t* end = buf + len;
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    __m256i forbidden_bytemask = _mm256_setzero_si256(); // accumulates surrogate hits from the SIMD path; checked once after the loop
 
+    while (buf + 8 + safety_margin <= end) { // each pass loads 8 code points (one 256-bit register)
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffffffff) {
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(forbidden_bytemask, _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); // saturation cannot trigger: every lane is <= 0xFFFF on this branch
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // the scalar fallback converts at most 7 words before the SIMD loop resumes
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 20
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf16_output);
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf16_output);
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
+    // check for invalid input (surrogate code points seen on the SIMD path)
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf16_output);
+    }
 
-  return std::make_pair(buf, utf16_output);
+    return std::make_pair(buf, utf16_output); // buf points at the first word not converted here; any remaining tail is left unconverted
 }
 
+template<endianness big_endian>
+std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // AVX2 UTF-32 => UTF-16 conversion that stops at the first detected error; returns a result (error code + input offset) paired with the advanced output pointer.
+    const char32_t* start = buf; // kept so positions can be reported as offsets from the start of the input
+    const char32_t* end = buf + len;
 
-template <endianness big_endian>
-std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    while (buf + 8 + safety_margin <= end) { // each pass loads 8 code points (one 256-bit register)
+        __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
-  while (buf + 8 + safety_margin <= end) {
-    __m256i in = _mm256_loadu_si256((__m256i*)buf);
+        const __m256i v_00000000 = _mm256_setzero_si256();
+        const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
-
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-      }
-
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),_mm256_extractf128_si256(in,1));
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+        // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+        const __m256i saturation_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+        if (saturation_bitmask == 0xffffffff) {
+            const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+            const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+            const __m256i forbidden_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0x0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); // offset is the start of this 8-word block, not the exact offending word
+            }
+
+            __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); // saturation cannot trigger: every lane is <= 0xFFFF on this branch
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // the scalar fallback converts at most 7 words before the SIMD loop resumes
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // never true here: the loop condition guarantees end - buf >= 20
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); // exact index of the offending word
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); // exact index of the offending word
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); // count covers only the words consumed above; any remaining tail is not converted here
 }
 /* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace haswell {
@@ -21348,92 +24332,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index(); // current byte offset (idx) into buf
+    simdutf_really_inline bool has_full_block() const; // true while idx < lenminusstep
+    simdutf_really_inline const uint8_t* full_block() const; // pointer to buf[idx]
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0, in which case this
+     * function returns 0 without writing to dst. In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance(); // moves idx forward by STEP_SIZE
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf; // input pointer handed to the constructor
+    const size_t len; // input length in bytes
+    const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+    size_t idx; // byte offset of the current block within buf; starts at 0
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text) // reads sizeof(simd8x64<uint8_t>) bytes from text
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // function-local static: allocated on first call; the malloc result is not checked
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // compared as int8_t, so bytes >= 0x80 are negative and also become '_'
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf; // the same buffer is returned (and overwritten) on every call
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in) // same output shape as format_input_text_64, but sourced from a SIMD block
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // function-local static: allocated on first call; the malloc result is not checked
+    in.store(reinterpret_cast<uint8_t*>(buf)); // presumably writes sizeof(simd8x64<uint8_t>) bytes; the loop below assumes so
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') { // plain char comparison: where char is signed, bytes >= 0x80 also match
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf; // the same buffer is returned (and overwritten) on every call
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask) // renders mask as 64 characters, bit 0 first
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1)); // function-local static: allocated on first call; the malloc result is not checked
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; // NOTE(review): shifts a size_t by up to 63, which assumes size_t is 64 bits wide
+    }
+    buf[64] = '\0';
+    return buf; // the same buffer is returned (and overwritten) on every call
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len) // stores the pointer; no bytes are read or copied here
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE } // reads the member len (declared before lenminusstep); clamped to 0 so the unsigned subtraction cannot wrap
+    , idx { 0 }
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep; // strict '<': a block that ends exactly at len is not reported here and is left for get_remainder()
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx]; // no bounds check here; the validation loops in this file call has_full_block() first
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const // dst must have room for STEP_SIZE bytes (see the memset below)
+{
+    if (len == idx) {
+        return 0; // memcpy(dst, null, 0) will trigger an error with some sanitizers; dst is left untouched on this path
+    }
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx); // overwrite the leading padding with the bytes that remain after idx
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE; // unconditional: no clamping against len is performed here
 }
 
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace haswell {
@@ -21442,21 +24444,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (1000____ second byte) and Overlong 4-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -21464,101 +24467,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; // 11110101 1000____
+    // TOO_LARGE_1000 also covers: 1111011_ 1000____
+    // TOO_LARGE_1000 also covers: 11111___ 1000____
+    // OVERLONG_4 below uses the same bit (1 << 6) as TOO_LARGE_1000.
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) // sc: the check_special_cases() result for this input (see check_utf8_bytes)
+{ // Returns sc XOR (must_be_2_3_continuation(prev2, prev3) & 0x80); 0x80 is the same bit as TWO_CONTS in check_special_cases.
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); // points at the last sizeof(simd8<uint8_t>) entries, so the three thresholds line up with the final three lanes
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -21569,51 +24563,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input); // NOTE(review): presumably input shifted back one byte with the gap filled from prev_input -- confirm in simd8::prev
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc); // error only accumulates; it is never cleared in this function
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete; // folds the pending "incomplete at end of block" state into error
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete; // same check as check_eof(); prev_incomplete and prev_input_block are left unchanged on this path
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block); // first chunk is checked against the last chunk remembered from the previous call
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block); // first chunk is checked against the last chunk remembered from the previous call
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]); // computed from the last chunk only
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]; // remembered for the first chunk of the next non-ASCII call
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere(); // true once any check has OR-ed a nonzero value into error
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -21622,7 +24619,7 @@ using utf8_validation::utf8_checker;
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace haswell {
@@ -21633,15 +24630,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -21650,97 +24648,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length) // const char* overload: forwards to the template with checker = utf8_checker
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length) // returns result(SUCCESS, length) when no error is flagged; otherwise the scalar validator's result, offset by count
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 }; // bytes covered by the full 64-byte blocks processed so far
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--; // Sometimes the error is only detected in the next chunk
+            }
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count); // the scalar routine is handed the tail starting at offset count
+            res.count += count; // presumably res.count is relative to input + count; this converts it to an offset from input
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length) // const char* overload: forwards to the template with checker = utf8_checker
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length) // ORs every 64-byte block (and the space-padded remainder) together and tests the result once
+{ // The checker template parameter is not referenced in this function body.
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length) // const char* overload: forwards to the template
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length); // utf8_checker only fills the template slot; the callee's body does not use it
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length) // the checker template parameter is not referenced in this function body
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 }; // bytes covered by the full 64-byte blocks that passed so far
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count); // re-scan from the start of the failing block with the scalar routine
+            return result(res.error, count + res.count); // scalar count is relative to input + count, so add count back
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block); // pads with 0x20 (an ASCII space), so the padding itself cannot fail the check below
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length) // const char* overload: forwards to the template
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length); // utf8_checker only fills the template slot; the callee's body does not use it
 }
 
 } // namespace utf8_validation
@@ -21749,10 +24756,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
@@ -21760,63 +24766,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{ // Transcodes `size` bytes of UTF-8 from `input` into UTF-16 at `utf16_output` with no validation here; returns the number of char16_t written.
+    // The implementation is not specific to haswell and should be moved to the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns! The SIMD loop stops early and the scalar call after it converts the rest.
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation byte is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -21824,32 +24831,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{ // Returns the AND of three 16-entry table lookups (prev1 high nibble, prev1 low nibble, input high nibble); a set bit marks an error class below.
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte, Bit 3 = Too Large
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte, Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -21857,258 +24863,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // (TOO_LARGE_1000 patterns) 11110101 1000____
+    //                           1111011_ 1000____
+    //                           11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________ (entries 1000..1100; 1101 and above follow)
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________ (first entry only; the last two entries are ____1110 and ____1111)
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{ // Returns (must_be_2_3_continuation(prev2, prev3) & 0x80) ^ sc, so a nonzero byte means the length check and `sc` disagree.
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder { // UTF-8 -> UTF-16 transcoder that validates as it goes; detected errors accumulate in `error`.
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // prev1 is input.prev<1>(prev_input) - presumably each byte's predecessor, carried in from prev_input (confirm in simd8).
+        // NOTE(review): the former "flip prev1...prev3" wording matched no operation visible here; errors are ORed into `error`.
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    { // Validating conversion of `size` UTF-8 bytes; returns the number of char16_t written, or 0 when an error was detected.
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth-to-last leading byte (the loop decrements once more after finding it).
+        const size_t safety_margin = size - margin + 1; // to avoid overruns! (size + 1 if the scan reached the start, which skips the SIMD loop)
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) { // zero output for a non-empty tail is treated as an error
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    { // Like convert(), but returns a result: on success count = char16_t written; on error count = pos plus the count reported by the scalar rescan.
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth-to-last leading byte (the loop decrements once more after finding it).
+        const size_t safety_margin = size - margin + 1; // to avoid overruns! (size + 1 if the scan reached the start, which skips the SIMD loop)
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    { // Reports whether `error` has any bit set, i.e. whether any check so far flagged invalid UTF-8.
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -22118,68 +25147,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{ // Transcodes UTF-8 to UTF-32 with no validation in this function (input is presumably known-valid, per the name); returns the number of char32_t written.
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12; // stop once fewer than 12 bytes of this 64-byte block remain
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output); // scalar routine handles whatever the SIMD loop left over
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{ // Returns a per-byte error bitmask: the AND of three 16-entry nibble lookups (high and low nibble of prev1, high nibble of input).
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (1000____ second byte) / Overlong 4-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -22187,251 +25214,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{ // XORs the 0x80 bit of must_be_2_3_continuation(prev2, prev3) with the special-case mask `sc` and returns the result.
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0)) // start with `error` initialised from 0, i.e. no error recorded yet
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Run the special-case lookup on (input, prev1), combine it with the multi-byte
+        // length check, and OR the outcome into `error` (nonzero means invalid UTF-8).
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    { // Validates and transcodes UTF-8 to UTF-32; returns the number of char32_t written, or 0 when an error is detected.
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65); // counts bytes above -65 as signed, i.e. non-continuation bytes
+        }
+        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 to 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) { // the SIMD checks above flagged invalid UTF-8
+            return 0;
+        }
+        if (pos < size) { // hand the remaining bytes to the scalar converter
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    { // Same loop as convert(), but returns a `result`: SUCCESS with the number of char32_t written, or the scalar helper's error result with `pos` added to its count.
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65); // counts bytes above -65 as signed, i.e. non-continuation bytes
+        }
+        // If the input is long enough, then we have that margin-1 is the fourth last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) { // bail out as soon as a block is flagged, before transcoding it
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 to 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    { // True once any check has left a bit set in `error`, i.e. invalid UTF-8 has been seen.
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -22441,36 +25490,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{ // Counts the bytes that are not UTF-8 continuation bytes, 64 bytes per iteration, then adds the scalar count for the tail.
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1); // one bit per byte that is below -64 as a signed value
+        count += 64 - count_ones(utf8_continuation_mask); // every other byte in the block is counted as one code point
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{ // Counts the UTF-16 words the given UTF-8 bytes would produce: one per non-continuation byte plus one extra per byte >= 240.
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240); // bytes >= 240 are the lead bytes of 4-byte sequences
+        count += count_ones(utf8_4byte); // each of those needs a second UTF-16 word (surrogate pair)
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{ // The UTF-32 length is simply the code point count, so this forwards to count_code_points().
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -22478,64 +25528,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace haswell
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace haswell {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -22547,466 +25605,667 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace haswell {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) {
+        return avx2_detect_encodings<utf8_validation::utf8_checker>(input, length);
     } else {
-      return simdutf::encoding_type::unspecified;
+        if (implementation::validate_utf8(input, length)) {
+            return simdutf::encoding_type::UTF8;
+        } else {
+            return simdutf::encoding_type::unspecified;
+        }
     }
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_utf8(buf,len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_ascii(buf,len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    const char16_t* tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+    if (tail) {
+        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
+}
+
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{
+    const char32_t* tail = avx2_validate_utf32le(buf, len);
+    if (tail) {
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else {
+        return false;
+    }
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
+    result res = avx2_validate_utf32le_with_errors(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count);
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = avx2_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* latin1_output) const noexcept
+{
+    return scalar::latin1_to_utf32::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-   return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-   return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    char32_t* utf32_output) const noexcept
+{
+    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
     }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = avx2_convert_utf32_to_utf8(buf, len, utf8_output);
+    if (ret.first == nullptr) { // null input cursor from the AVX2 pass: give up and report 0
+        return 0;
     }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    size_t saved_bytes = ret.second - utf8_output; // bytes emitted so far by the AVX2 pass
+    if (ret.first != buf + len) { // AVX2 pass stopped before the end: convert the tail with the scalar routine
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // scalar pass reported 0: propagate it as the overall result
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output); // forwards straight to the scalar converter; no AVX2 path is used here
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); // forwards straight to the scalar converter; its result is returned unchanged
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); // forwards straight to the scalar convert_valid routine
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike convert_utf16le_to_utf32_with_errors; presumably the scalar pass re-detects the error at the same position - confirm
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // rebase the scalar position onto the start of buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor past what the scalar pass wrote
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words (bytes) written
+    return ret.first;
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first == nullptr) { // null input cursor from the AVX2 pass: give up and report 0
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output; // despite the name, this is a count of char32_t units written, not bytes
+    if (ret.first != buf + len) { // AVX2 pass stopped before the end: convert the tail with the scalar routine
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // scalar pass reported 0: propagate it as the overall result
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first == nullptr) { // null input cursor from the AVX2 pass: give up and report 0
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output; // despite the name, this is a count of char32_t units written, not bytes
+    if (ret.first != buf + len) { // AVX2 pass stopped before the end: convert the tail with the scalar routine
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // scalar pass reported 0: propagate it as the overall result
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // rebase the scalar position onto the start of buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor past what the scalar pass wrote
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words (char32_t units) written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char32_t*> ret = haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // rebase the scalar position onto the start of buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor past what the scalar pass wrote
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words (char32_t units) written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output); // the "valid" variant simply reuses convert_utf32_to_utf8
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first == nullptr) { // null input cursor from the AVX2 pass: give up and report 0
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output; // despite the name, this is a count of char16_t units written, not bytes
+    if (ret.first != buf + len) { // AVX2 pass stopped before the end: convert the tail with the scalar routine
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // scalar pass reported 0: propagate it as the overall result
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first == nullptr) { // null input cursor from the AVX2 pass: give up and report 0
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output; // despite the name, this is a count of char16_t units written, not bytes
+    if (ret.first != buf + len) { // AVX2 pass stopped before the end: convert the tail with the scalar routine
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) { // scalar pass reported 0: propagate it as the overall result
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike convert_utf16le_to_utf32_with_errors; presumably the scalar pass re-detects the error at the same position - confirm
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // rebase the scalar position onto the start of buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor past what the scalar pass wrote
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words (char16_t units) written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char16_t*> ret = haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) { // NOTE(review): no early return on ret.first.error here, unlike convert_utf16be_to_utf32_with_errors; presumably the scalar pass re-detects the error at the same position - confirm
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count; // rebase the scalar position onto the start of buf
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count; // advance the output cursor past what the scalar pass wrote
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words (char16_t units) written
+    return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output); // the "valid" variant simply reuses convert_utf32_to_utf16le
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16be(buf, len, utf16_output); // the "valid" variant simply reuses convert_utf32_to_utf16be
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16le_to_utf32(buf, len, utf32_output); // the "valid" variant simply reuses convert_utf16le_to_utf32
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return convert_utf16be_to_utf32(buf, len, utf32_output); // the "valid" variant simply reuses convert_utf16be_to_utf32
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    utf16::change_endianness_utf16(input, length, output); // forwards to the utf16 helper of the same name; results go to output
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::LITTLE>(input, length); // forwards to utf16::count_code_points, little-endian instantiation
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m256i v_00000000 = _mm256_setzero_si256();
-  const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
-  const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 8 <= length; pos += 8) {
-    __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
-    const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
-    const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
-    const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-    const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
-    const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
-    const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::count_code_points<endianness::BIG>(input, length); // forwards to utf16::count_code_points, big-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::count_code_points(input, length); // forwards to utf8::count_code_points
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{
+    return scalar::utf8::latin1_length_from_utf8(buf, len); // forwards to the scalar routine
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{
+    return scalar::utf16::latin1_length_from_utf16(length); // derived from the unit count alone; no buffer is inspected
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{
+    return scalar::utf32::latin1_length_from_utf32(length); // derived from the unit count alone; no buffer is inspected
+}
 
-    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-    count += 32 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
-  }
-  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length); // forwards to utf16::utf8_length_from_utf16, little-endian instantiation
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m256i v_00000000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 8 <= length; pos += 8) {
-    __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
-    const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
-    size_t surrogate_count = (32-count_ones(surrogate_bitmask))/4;
-    count += 8 + surrogate_count;
-  }
-  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length); // forwards to utf16::utf8_length_from_utf16, big-endian instantiation
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length); // forwards to utf16::utf32_length_from_utf16, little-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length); // forwards to utf16::utf32_length_from_utf16, big-endian instantiation
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf16_length_from_latin1(length); // derived from the byte count alone; no buffer is inspected
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::utf16_length_from_utf8(input, length); // forwards to utf8::utf16_length_from_utf8
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{
+    return scalar::latin1::utf32_length_from_latin1(length); // derived from the byte count alone; no buffer is inspected
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{
+    return scalar::latin1::utf8_length_from_latin1(input, length); // forwards to the scalar routine; no AVX2 path is used here
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); // bits that must be clear for a value below 0x80
+    const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); // bits that must be clear for a value below 0x800
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); // bits that must be clear for a value below 0x10000
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 8 <= length; pos += 8) { // one unaligned 256-bit load covers 8 char32_t values
+        __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
+        const __m256i ascii_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); // lanes encodable in 1 byte
+        const __m256i one_two_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); // lanes encodable in 1 or 2 bytes
+        const __m256i two_bytes_bytemask = _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); // lanes needing exactly 2 bytes
+        const __m256i one_two_three_bytes_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); // lanes encodable in at most 3 bytes
+        const __m256i three_bytes_bytemask = _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); // lanes needing exactly 3 bytes
+        const uint32_t ascii_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
+        const uint32_t two_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
+        const uint32_t three_bytes_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+
+        size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; // 4 movemask bits per 32-bit lane, so /4 turns the bit count into a lane count
+        size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+        size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+        count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; // start from 8 lanes x 4 bytes, then subtract what each shorter encoding saves
+    }
+    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); // the remaining (fewer than 8) values go through the scalar routine
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); // selects the upper 16 bits of each 32-bit lane
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 8 <= length; pos += 8) { // one unaligned 256-bit load covers 8 char32_t values
+        __m256i in = _mm256_loadu_si256((__m256i*)(input + pos));
+        const __m256i surrogate_bytemask = _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); // despite the name: set for lanes whose upper 16 bits are zero, i.e. lanes that fit in ONE 16-bit unit
+        const uint32_t surrogate_bitmask = static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
+        size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4; // unset mask bits / 4 = lanes with non-zero upper bits (need a second 16-bit unit)
+        count += 8 + surrogate_count; // one unit per value, plus one extra per value above 0xFFFF
+    }
+    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); // the remaining (fewer than 8) values go through the scalar routine
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length); // the UTF-32 length is taken to be the scalar code-point count of the input
 }
 
 } // namespace haswell
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/haswell/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/haswell/end.h
 /* begin file src/simdutf/haswell/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
 // nothing needed.
@@ -23014,7 +26273,6 @@ simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * i
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
 #if SIMDUTF_GCC11ORMORE // workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
 SIMDUTF_POP_DISABLE_WARNINGS
 #endif // end of workaround
@@ -23022,14 +26280,10 @@ SIMDUTF_POP_DISABLE_WARNINGS
 /* end file src/haswell/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_PPC64
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=ppc64/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=ppc64/implementation.cpp
 /* begin file src/ppc64/implementation.cpp */
 
-
-
-
-
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/begin.h
 /* begin file src/simdutf/ppc64/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "ppc64"
 // #define SIMDUTF_IMPLEMENTATION ppc64
@@ -23042,32 +26296,34 @@ namespace {
 #endif
 using namespace simd;
 
-
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
-  // careful: 0x80 is not ascii.
-  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
+    // careful: 0x80 is not ascii, so the saturating subtraction uses 0x7F: any byte above 0x7F leaves a non-zero residue.
+    return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64 (0xFF - 0xBF at most), so signed comparison is fine.
+    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 32 (0xFF - 0xDF at most), so signed comparison is fine.
+    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace ppc64 {
@@ -23077,92 +26333,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index();
+    simdutf_really_inline bool has_full_block() const;
+    simdutf_really_inline const uint8_t* full_block() const;
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
+     * function returns 0 and leaves dst untouched). In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance();
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf;
+    const size_t len;
+    const size_t lenminusstep;
+    size_t idx;
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+    in.store(reinterpret_cast<uint8_t*>(buf));
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') {
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf;
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1));
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+    }
+    buf[64] = '\0';
+    return buf;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf }
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE }
+    , idx { 0 }
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx];
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{
+    if (len == idx) {
+        return 0;
+    } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx);
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE;
 }
 
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace ppc64 {
@@ -23171,21 +26445,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -23193,101 +26468,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // TOO_LARGE_1000 patterns: 11110101 1000____
+    //                          1111011_ 1000____
+    //                          11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -23298,51 +26564,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete;
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -23351,7 +26620,7 @@ using utf8_validation::utf8_checker;
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace ppc64 {
@@ -23362,15 +26631,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -23379,97 +26649,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 };
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            res.count += count;
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length)
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 };
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            return result(res.error, count + res.count);
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 } // namespace utf8_validation
@@ -23478,10 +26757,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace ppc64 {
 namespace {
@@ -23489,63 +26767,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // This implementation is not specific to haswell, which is why it lives in the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation mask is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -23553,32 +26832,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -23586,258 +26864,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -23847,68 +27148,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -23916,251 +27215,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    {
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, then in[margin] is the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) {
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct validating_transcoder
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -24170,36 +27491,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        count += 64 - count_ones(utf8_continuation_mask);
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        // We count one word for anything that is not a continuation (so
+        // leading bytes).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240);
+        count += count_ones(utf8_4byte);
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -24207,64 +27529,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace ppc64
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace ppc64 {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+        count += count_ones(not_pair) / 2;
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) {
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F);
+        uint64_t twobyte_mask = input.lteq(0x7FF);
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+        size_t ascii_count = count_ones(ascii_mask) / 2;
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32;
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -24279,242 +27609,303 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace ppc64 {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  int out = 0;
-  if(validate_utf8(input, length)) { out |= encoding_type::UTF8; }
-  if((length % 2) == 0) {
-    if(validate_utf16(reinterpret_cast<const char16_t*>(input), length/2)) { out |= encoding_type::UTF16_LE; }
-  }
-  if((length % 4) == 0) {
-    if(validate_utf32(reinterpret_cast<const char32_t*>(input), length/4)) { out |= encoding_type::UTF32_LE; }
-  }
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    int out = 0;
+    if (validate_utf8(input, length)) {
+        out |= encoding_type::UTF8;
+    }
+    if ((length % 2) == 0) {
+        if (validate_utf16(reinterpret_cast<const char16_t*>(input), length / 2)) {
+            out |= encoding_type::UTF16_LE;
+        }
+    }
+    if ((length % 4) == 0) {
+        if (validate_utf32(reinterpret_cast<const char32_t*>(input), length / 4)) {
+            out |= encoding_type::UTF32_LE;
+        }
+    }
 
-  return out;
+    return out;
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_utf8(buf,len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_ascii(buf,len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{
+    return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::LITTLE>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::BIG>(buf, len);
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{
+    return scalar::utf32::validate_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate(buf, len);
+simdutf_warn_unused bool implementation::validate_utf32(const char16_t* buf, size_t len) const noexcept
+{
+    return scalar::utf32::validate(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* /*buf*/, size_t /*len*/, char16_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
+{
+    return result(error_code::OTHER, 0); // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept {
-  return 0; // stub
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* /*buf*/, size_t /*len*/, char32_t* /*utf16_output*/) const noexcept
+{
+    return 0; // stub
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, utf32_output);
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  scalar::utf16::change_endianness_utf16(input, length, output);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{
+    scalar::utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{
+    return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{
+    return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::utf16_length_from_utf8(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::utf16_length_from_utf8(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf8_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf8_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  return scalar::utf32::utf16_length_from_utf32(input, length);
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{
+    return scalar::utf32::utf16_length_from_utf32(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace ppc64
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/ppc64/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/ppc64/end.h
 /* begin file src/simdutf/ppc64/end.h */
 /* end file src/simdutf/ppc64/end.h */
 /* end file src/ppc64/implementation.cpp */
 #endif
 #if SIMDUTF_IMPLEMENTATION_WESTMERE
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/implementation.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/implementation.cpp
 /* begin file src/westmere/implementation.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/begin.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/begin.h
 /* begin file src/simdutf/westmere/begin.h */
 // redefining SIMDUTF_IMPLEMENTATION to "westmere"
 // #define SIMDUTF_IMPLEMENTATION westmere
@@ -24533,30 +27924,34 @@ namespace {
 #endif
 using namespace simd;
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input) {
-  return input.reduce_or().is_ascii();
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t>& input)
+{
+    return input.reduce_or().is_ascii();
 }
 
-simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u-1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_second_byte = prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte  = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
+simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3)
+{
+    simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+    simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+    // Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
+    return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
 }
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_detect_encodings.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_detect_encodings.cpp
 /* begin file src/westmere/sse_detect_encodings.cpp */
 template<class checker>
 // len is known to be a multiple of 2 when this is called
-int sse_detect_encodings(const char * buf, size_t len) {
+int sse_detect_encodings(const char* buf, size_t len)
+{
     const char* start = buf;
     const char* end = buf + len;
 
@@ -24571,13 +27966,13 @@ int sse_detect_encodings(const char * buf, size_t len) {
 
     __m128i currentmax = _mm_setzero_si128();
 
-    checker check{};
+    checker check {};
 
-    while(buf + 64 <= end) {
+    while (buf + 64 <= end) {
         __m128i in = _mm_loadu_si128((__m128i*)buf);
-        __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
-        __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-        __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
+        __m128i secondin = _mm_loadu_si128((__m128i*)buf + 1);
+        __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
+        __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
 
         const auto u0 = simd16<uint16_t>(in);
         const auto u1 = simd16<uint16_t>(secondin);
@@ -24611,15 +28006,15 @@ int sse_detect_encodings(const char * buf, size_t len) {
                 is_utf32 = false;
                 // Code from sse_validate_utf16le.cpp
                 // Not efficient, we do not process surrogates_bitmask1
-                const char16_t * input = reinterpret_cast<const char16_t*>(buf);
-                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len/2;
+                const char16_t* input = reinterpret_cast<const char16_t*>(buf);
+                const char16_t* end16 = reinterpret_cast<const char16_t*>(start) + len / 2;
 
                 const auto v_fc = simd8<uint8_t>::splat(0xfc);
                 const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
                 const uint16_t V0 = static_cast<uint16_t>(~surrogates_bitmask0);
 
-                const auto    vH0 = (in16 & v_fc) == v_dc;
+                const auto vH0 = (in16 & v_fc) == v_dc;
                 const uint16_t H0 = static_cast<uint16_t>(vH0.to_bitmask());
 
                 const uint16_t L0 = static_cast<uint16_t>(~H0 & surrogates_bitmask0);
@@ -24655,7 +28050,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
                     } else {
                         const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
-                        const auto    vH = (in_16 & v_fc) == v_dc;
+                        const auto vH = (in_16 & v_fc) == v_dc;
                         const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
                         const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
@@ -24680,8 +28075,8 @@ int sse_detect_encodings(const char * buf, size_t len) {
                 is_utf16 = false;
                 // Check for UTF-32
                 if (len % 4 == 0) {
-                    const char32_t * input = reinterpret_cast<const char32_t*>(buf);
-                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len/4;
+                    const char32_t* input = reinterpret_cast<const char32_t*>(buf);
+                    const char32_t* end32 = reinterpret_cast<const char32_t*>(start) + len / 4;
 
                     // Must start checking for surrogates
                     __m128i currentoffsetmax = _mm_setzero_si128();
@@ -24699,14 +28094,14 @@ int sse_detect_encodings(const char * buf, size_t len) {
                     currentoffsetmax = _mm_max_epu32(_mm_add_epi32(fourthin, offset), currentoffsetmax);
 
                     while (input + 4 < end32) {
-                        const __m128i in32 = _mm_loadu_si128((__m128i *)input);
-                        currentmax = _mm_max_epu32(in32,currentmax);
+                        const __m128i in32 = _mm_loadu_si128((__m128i*)input);
+                        currentmax = _mm_max_epu32(in32, currentmax);
                         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in32, offset), currentoffsetmax);
                         input += 4;
                     }
 
                     __m128i forbidden_words = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-                    if(_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
+                    if (_mm_testz_si128(forbidden_words, forbidden_words) == 0) {
                         is_utf32 = false;
                     }
                 } else {
@@ -24735,7 +28130,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
 
     if (is_utf8) {
         if (static_cast<size_t>(buf - start) != len) {
-            uint8_t block[64]{};
+            uint8_t block[64] {};
             std::memset(block, 0x20, 64);
             std::memcpy(block, buf, len - (buf - start));
             simd::simd8x64<uint8_t> in(block);
@@ -24746,14 +28141,14 @@ int sse_detect_encodings(const char * buf, size_t len) {
         }
     }
 
-    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start))/2)) {
+    if (is_utf16 && scalar::utf16::validate<endianness::LITTLE>(reinterpret_cast<const char16_t*>(buf), (len - (buf - start)) / 2)) {
         out |= simdutf::encoding_type::UTF16_LE;
     }
 
     if (is_utf32 && (len % 4 == 0)) {
         const __m128i standardmax = _mm_set1_epi32(0x10ffff);
         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start))/4)) {
+        if (_mm_testz_si128(is_zero, is_zero) == 1 && scalar::utf32::validate(reinterpret_cast<const char32_t*>(buf), (len - (buf - start)) / 4)) {
             out |= simdutf::encoding_type::UTF32_LE;
         }
     }
@@ -24762,7 +28157,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
 }
 /* end file src/westmere/sse_detect_encodings.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_validate_utf16.cpp
 /* begin file src/westmere/sse_validate_utf16.cpp */
 /*
     In UTF-16 words in range 0xD800 to 0xDFFF have special meaning.
@@ -24809,8 +28204,9 @@ int sse_detect_encodings(const char * buf, size_t len) {
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-template <endianness big_endian>
-const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const char16_t* sse_validate_utf16(const char16_t* input, size_t size)
+{
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -24851,19 +28247,19 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-            const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
+            const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
+                                                                    // (A low surrogate placed in the 7th register's word
+                                                                    // is an exception we handle.)
+            const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opposite fact holds,
+                                                              // thanks to that we have only two masks for valid case.
+            const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
 
             if (c == 0xffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -24884,9 +28280,9 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
     return input;
 }
 
-
-template <endianness big_endian>
-const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
+template<endianness big_endian>
+const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
+{
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -24929,19 +28325,19 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
             const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
             // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-            const auto    vH = (in & v_fc) == v_dc;
+            const auto vH = (in & v_fc) == v_dc;
             const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
             // L - word mask for low surrogates
             //     L = not H and surrogates_wordmask
             const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-            const uint16_t a = static_cast<uint16_t>(L & (H >> 1));  // A low surrogate must be followed by high one.
-                                              // (A low surrogate placed in the 7th register's word
-                                              // is an exception we handle.)
-            const uint16_t b = static_cast<uint16_t>(a << 1);        // Just mark that the opinput - startite fact is hold,
-                                              // thanks to that we have only two masks for valid case.
-            const uint16_t c = static_cast<uint16_t>(V | a | b);     // Combine all the masks into the final one.
+            const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // A low surrogate must be followed by high one.
+                                                                    // (A low surrogate placed in the 7th register's word
+                                                                    // is an exception we handle.)
+            const uint16_t b = static_cast<uint16_t>(a << 1); // Just mark that the opposite fact holds,
+                                                              // thanks to that we have only two masks for valid case.
+            const uint16_t c = static_cast<uint16_t>(V | a | b); // Combine all the masks into the final one.
 
             if (c == 0xffff) {
                 // The whole input register contains valid UTF-16, i.e.,
@@ -24962,13 +28358,14 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
     return result(error_code::SUCCESS, input - start);
 }
 /* end file src/westmere/sse_validate_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_validate_utf32le.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_validate_utf32le.cpp
 /* begin file src/westmere/sse_validate_utf32le.cpp */
 /* Returns:
    - pointer to the last unprocessed character (a scalar fallback should check the rest);
    - nullptr if an error was detected.
 */
-const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
+const char32_t* sse_validate_utf32le(const char32_t* input, size_t size)
+{
     const char32_t* end = input + size;
 
     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
@@ -24978,26 +28375,26 @@ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
     __m128i currentoffsetmax = _mm_setzero_si128();
 
     while (input + 4 < end) {
-        const __m128i in = _mm_loadu_si128((__m128i *)input);
-        currentmax = _mm_max_epu32(in,currentmax);
+        const __m128i in = _mm_loadu_si128((__m128i*)input);
+        currentmax = _mm_max_epu32(in, currentmax);
         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
         input += 4;
     }
     __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-    if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-    if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
         return nullptr;
     }
 
     return input;
 }
 
-
-const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size)
+{
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -25008,17 +28405,17 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size
     __m128i currentoffsetmax = _mm_setzero_si128();
 
     while (input + 4 < end) {
-        const __m128i in = _mm_loadu_si128((__m128i *)input);
-        currentmax = _mm_max_epu32(in,currentmax);
+        const __m128i in = _mm_loadu_si128((__m128i*)input);
+        currentmax = _mm_max_epu32(in, currentmax);
         currentoffsetmax = _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
 
         __m128i is_zero = _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-        if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
             return result(error_code::TOO_LARGE, input - start);
         }
 
         is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-        if(_mm_test_all_zeros(is_zero, is_zero) == 0) {
+        if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
             return result(error_code::SURROGATE, input - start);
         }
         input += 4;
@@ -25028,309 +28425,291 @@ const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size
 }
 /* end file src/westmere/sse_validate_utf32le.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf8_to_utf16.cpp
 /* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    __m128i ascii_first = _mm_cvtepu8_epi16(in);
-    __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in,8));
-    if (big_endian) {
-      ascii_first = _mm_shuffle_epi8(ascii_first, swap);
-      ascii_second = _mm_shuffle_epi8(ascii_second, swap);
-    }
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), ascii_second);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian) composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-     __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    const __m128i composedminus =
-        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-    const __m128i lowtenbits =
-        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-    const __m128i hightenbits = _mm_srli_epi32(composedminus, 10);
-    const __m128i lowtenbitsadd =
-        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-    const __m128i hightenbitsadd =
-        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-    __m128i surrogates =
-        _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (big_endian) {
-      _mm_storeu_si128((__m128i *)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
-      surrogates = _mm_shuffle_epi8(surrogates, swap);
-    }
-    _mm_storeu_si128((__m128i *)basic_buffer, composed);
-    uint32_t surrogate_buffer[4];
-    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
-    for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] < 65536) {
-        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
-        utf16_output++;
-      } else {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
-        utf16_output += 2;
-      }
-    }
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+template<endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char16_t*& utf16_output)
+{
+    // we use an approach where we try to process up to 12 input bytes.
+    // Why 12 input bytes and not 16? Because we are concerned with the size of
+    // the lookup tables. Also 12 is nicely divisible by two and three.
+    //
+    //
+    // Optimization note: our main path below is load-latency dependent. Thus it is maybe
+    // beneficial to have fast paths that depend on branch prediction but have less latency.
+    // This results in more instructions but, potentially, also higher speeds.
+    //
+    // We first try a few fast paths.
+    const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i in = _mm_loadu_si128((__m128i*)input);
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // We process the data in chunks of 16 bytes.
+        __m128i ascii_first = _mm_cvtepu8_epi16(in);
+        __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
+        if (big_endian) {
+            ascii_first = _mm_shuffle_epi8(ascii_first, swap);
+            ascii_second = _mm_shuffle_epi8(ascii_second, swap);
+        }
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output), ascii_first);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf16_output + 8), ascii_second);
+        utf16_output += 16; // We wrote 16 16-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
+        // We want to take 8 2-byte UTF-8 words and turn them into 8 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 8; // We wrote 16 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // We want to take 4 3-byte UTF-8 words and turn them into 4 2-byte UTF-16 words.
+        // There is probably a more efficient sequence, but the following might do.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4;
+        return 12;
+    }
+    /// We do not have a fast path available, so we fall back to the table-driven path.
+
+    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+    if (idx < 64) {
+        // SIX (6) input code-words
+        // this is a relatively easy scenario
+        // we process SIX (6) input code-words. The max length in bytes of six code
+        // words spanning between 1 and 2 bytes each is 12 bytes. On processors
+        // where pdep/pext is fast, we might be able to use a small lookup table.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        if (big_endian)
+            composed = _mm_shuffle_epi8(composed, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed);
+        utf16_output += 6; // We wrote 12 bytes, 6 code points.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+        if (big_endian)
+            composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+        _mm_storeu_si128((__m128i*)utf16_output, composed_repacked);
+        utf16_output += 4;
+    } else if (idx < 209) {
+        // THREE (3) input code-words (the loop below emits three code points)
+        //////////////
+        // There might be garbage inputs where a leading byte masquerades as a four-byte
+        // leading byte (by being followed by 3 continuation bytes), but is smaller than
+        // 0xf0. This could trigger a buffer overflow if we only counted leading
+        // bytes of the form 0xf0 as generating surrogate pairs, without further UTF-8 validation.
+        // Thus we must be careful to ensure that only leading bytes at least as large as 0xf0 generate surrogate pairs.
+        // We do so at the cost of an extra mask.
+        /////////////
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // correct for spurious high bit
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        // We deliberately carry the leading four bits in highbyte if they are present;
+        // we remove them later when computing hightenbits.
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+        // the corresponding 32-bit value in 'composed' will be at least
+        // (0xf0000000>>6), i.e. 0x3c00000. The loop below tests > 0x3c00000 to
+        // identify the location of the surrogate pairs.
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        const __m128i composedminus = _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+        const __m128i lowtenbits = _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+        // Notice the 0x3ff mask:
+        const __m128i hightenbits = _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+        const __m128i lowtenbitsadd = _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+        const __m128i hightenbitsadd = _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+        const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+        __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+        uint32_t basic_buffer[4];
+        uint32_t basic_buffer_swap[4];
+        if (big_endian) {
+            _mm_storeu_si128((__m128i*)basic_buffer_swap, _mm_shuffle_epi8(composed, swap));
+            surrogates = _mm_shuffle_epi8(surrogates, swap);
+        }
+        _mm_storeu_si128((__m128i*)basic_buffer, composed);
+        uint32_t surrogate_buffer[4];
+        _mm_storeu_si128((__m128i*)surrogate_buffer, surrogates);
+        for (size_t i = 0; i < 3; i++) {
+            if (basic_buffer[i] > 0x3c00000) {
+                utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+                utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+                utf16_output += 2;
+            } else {
+                utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) : uint16_t(basic_buffer[i]);
+                utf16_output++;
+            }
+        }
+    } else {
+        // here we know that there is an error but we do not handle errors
+    }
+    return consumed;
 }
 /* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf8_to_utf32.cpp
 /* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
-
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                           uint64_t utf8_end_of_code_point_mask,
-                           char32_t *&utf32_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it is maybe
-  // beneficial to have fast paths that depend on branch prediction but have less latency.
-  // This results in more instructions but, potentially, also higher speeds.
-  //
-  // We first try a few fast paths.
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
-    // We process the data in chunks of 16 bytes.
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu8_epi32(in));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu8_epi32(_mm_srli_si128(in,4)));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+8), _mm_cvtepu8_epi32(_mm_srli_si128(in,8)));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+12), _mm_cvtepu8_epi32(_mm_srli_si128(in,12)));
-    utf32_output += 16; // We wrote 16 32-bit characters.
-    return 16; // We consumed 16 bytes.
-  }
-  if(((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 words and turn them into 8 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
-    utf32_output += 8; // We wrote 32 bytes, 8 code points.
-    return 16;
-  }
-  if(input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 words and turn them into 4 4-byte UTF-32 words.
-    // There is probably a more efficient sequence, but the following might do.
-    const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-    return 12;
-  }
-  /// We do not have a fast path available, so we fallback.
-
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  if (idx < 64) {
-    // SIX (6) input code-words
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-words. The max length in bytes of six code
-    // words spanning between 1 and 2 bytes each is 12 bytes. On processors
-    // where pdep/pext is fast, we might be able to use a small lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(composed));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(composed,8)));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
-  } else if (idx < 145) {
-    // FOUR (4) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-  } else if (idx < 209) {
-    // TWO (2) input code-words
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 3;
-  } else {
-    // here we know that there is an error but we do not handle errors
-  }
-  return consumed;
+size_t convert_masked_utf8_to_utf32(const char* input,
+    uint64_t utf8_end_of_code_point_mask,
+    char32_t*& utf32_output)
+{
+    // Converts a prefix of the 16 bytes loaded from `input` to UTF-32, writes the
+    // code points at `utf32_output` (advancing that reference), and returns the
+    // number of input bytes consumed.
+    //
+    // The general path handles at most 12 input bytes rather than 16 in order to
+    // keep the lookup tables small; 12 is also divisible by two and by three.
+    //
+    // Optimization note: the table-driven path below is load-latency dependent, so
+    // a few branchy fast paths with lower latency are tried first. They cost extra
+    // instructions but can be faster when they are well predicted.
+    const __m128i in = _mm_loadu_si128((__m128i*)input);
+    const uint16_t input_utf8_end_of_code_point_mask = utf8_end_of_code_point_mask & 0xfff;
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xffff)) {
+        // Each of the 16 bytes ends a code point: zero-extend every byte to 32 bits.
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu8_epi32(in));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 8), _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 12), _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
+        utf32_output += 16; // We wrote 16 32-bit characters.
+        return 16; // We consumed 16 bytes.
+    }
+    if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+        // Every second byte ends a code point: take 8 2-byte UTF-8 words and turn
+        // them into 8 4-byte UTF-32 words. A more efficient sequence may exist.
+        const __m128i sh = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(composed));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+        utf32_output += 8; // We wrote 32 bytes, 8 code points.
+        return 16;
+    }
+    if (input_utf8_end_of_code_point_mask == 0x924) {
+        // Every third of the low 12 bytes ends a code point: take 4 3-byte UTF-8
+        // words and turn them into 4 4-byte UTF-32 words. A better sequence may exist.
+        const __m128i sh = _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+        return 12;
+    }
+    // No fast path applies, so fall back to the table-driven conversion.
+
+    const uint8_t idx = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+    const uint8_t consumed = tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+    if (idx < 64) {
+        // SIX (6) input code-words.
+        // This is the relatively easy scenario: six code-words spanning 1 or 2
+        // bytes each occupy at most 12 input bytes. The shuffle mask selected by
+        // idx places each code-word in its own 16-bit lane. On processors where
+        // pdep/pext is fast, a small lookup table might be usable instead.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+        const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(composed));
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+        utf32_output += 6; // Advance by 6 code points; the two stores above cover 8 lanes.
+    } else if (idx < 145) {
+        // FOUR (4) input code-words; each 32-bit lane combines up to three bytes.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 4;
+    } else if (idx < 209) {
+        // Originally labelled "TWO (2) input code-words". NOTE(review): the output advances by 3 below; confirm the count.
+        const __m128i sh = _mm_loadu_si128((const __m128i*)tables::utf8_to_utf16::shufutf8[idx]);
+        const __m128i perm = _mm_shuffle_epi8(in, sh);
+        const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+        const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+        const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+        __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+        // Correct for a spurious high bit: bit 0x400000 is shifted onto 0x200000 and XORed out.
+        const __m128i correct = _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+        middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+        const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+        const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+        const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+        const __m128i composed = _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+            _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+        _mm_storeu_si128((__m128i*)utf32_output, composed);
+        utf32_output += 3;
+    } else {
+        // Here we know that there is an error, but errors are not handled: nothing is written.
+    }
+    return consumed;
 }
 /* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf16_to_utf8.cpp
 /* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -25385,480 +28764,485 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output) {
+template<endianness big_endian>
+std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output)
+{
 
-  const char16_t* end = buf + len;
+    const char16_t* end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-    if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
+    while (buf + 16 + safety_margin <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
         if (big_endian) {
-          const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          nextin = _mm_shuffle_epi8(nextin, swap);
-        }
-        if(!_mm_testz_si128(nextin, v_ff80)) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,in);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-          const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m128i t0 = _mm_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m128i t1 = _mm_and_si128(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m128i t2 = _mm_and_si128(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m128i t3 = _mm_or_si128(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-          const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-          const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-          const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m128i s0 = _mm_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-        const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-        const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                              (one_or_two_bytes_bitmask & 0xaaaa);
-        if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-          const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+        if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+            __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                nextin = _mm_shuffle_epi8(nextin, swap);
+            }
+            if (!_mm_testz_si128(nextin, v_ff80)) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, in);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        // no bits set above 7th bit
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // 1. Check if there are any surrogate word in the input chunk.
+        //    We have also deal with situation when there is a surrogate word
+        //    at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if the last word is a surrogate
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(buf, utf8_output);
+    return std::make_pair(buf, utf8_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-    if(_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-        __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
+template<endianness big_endian>
+std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output)
+{
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) {
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
         if (big_endian) {
-          const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-          nextin = _mm_shuffle_epi8(nextin, swap);
-        }
-        if(!_mm_testz_si128(nextin, v_ff80)) {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,in);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 8;
-          utf8_output += 8;
-          in = nextin;
-        } else {
-          // 1. pack the bytes
-          // obviously suboptimal.
-          const __m128i utf8_packed = _mm_packus_epi16(in,nextin);
-          // 2. store (16 bytes)
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-          // 3. adjust pointers
-          buf += 16;
-          utf8_output += 16;
-          continue; // we are done for this round!
-        }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-          // 1. prepare 2-byte values
-          // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-          // expected output   : [110a|aaaa|10bb|bbbb] x 8
-          const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-          const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-          // t0 = [000a|aaaa|bbbb|bb00]
-          const __m128i t0 = _mm_slli_epi16(in, 2);
-          // t1 = [000a|aaaa|0000|0000]
-          const __m128i t1 = _mm_and_si128(t0, v_1f00);
-          // t2 = [0000|0000|00bb|bbbb]
-          const __m128i t2 = _mm_and_si128(in, v_003f);
-          // t3 = [000a|aaaa|00bb|bbbb]
-          const __m128i t3 = _mm_or_si128(t1, t2);
-          // t4 = [110a|aaaa|10bb|bbbb]
-          const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-          // 2. merge ASCII and 2-byte codewords
-          const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
-
-          // 3. prepare bitmask for 8-bit lookup
-          //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-          const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-          const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-          const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-          // 4. pack the bytes
-          const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-          const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-          const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-          // 5. store bytes
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-          // 6. adjust pointers
-          buf += 8;
-          utf8_output += row[0];
-          continue;
-
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-        const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-        /* In this branch we handle three cases:
-           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-          We expand the input word (16-bit) into two words (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
-
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
-
-          We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-          either byte 1 for case #2 or byte 2 for case #3. Note that they
-          differ by exactly one bit.
-
-          Finally from these two words we build proper UTF-8 sequence, taking
-          into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-        // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-        const __m128i s0 = _mm_srli_epi16(in, 4);
-        // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-        const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-        // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-        const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-        // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-        const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-        const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-        // 4. expand words 16-bit => 32-bit
-        const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-        const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-        // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                              (one_or_two_bytes_bitmask & 0xaaaa);
-        if(mask == 0) {
-          // We only have three-byte words. Use fast path.
-          const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-          const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-          const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-          utf8_output += 12;
-          _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }
-        const uint8_t mask0 = uint8_t(mask);
-
-        const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-        const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += row0[0];
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += row1[0];
-
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word & 0xFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xF800 ) != 0xD800) {
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
+        // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+        const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+        if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+            __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                nextin = _mm_shuffle_epi8(nextin, swap);
+            }
+            if (!_mm_testz_si128(nextin, v_ff80)) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, in);
+                // 2. store (16 bytes written; only the first 8 are kept, see the pointer adjustment below)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                in = nextin;
+            } else {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        // no bits set above 7th bit
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // 1. Check if there are any surrogate words in the input chunk.
+        //    We also have to deal with the situation when there is a surrogate word
+        //    at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if the last word is a surrogate
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+               1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+               2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+               3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e., the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+            // surrogate pair(s) in a register
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value>>18) | 0b11110000);
-          *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xF800) != 0xD800) {
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    // must be a surrogate pair
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++;
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf8_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000;
+                    *utf8_output++ = char((value >> 18) | 0b11110000);
+                    *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((value & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
-    }
-  } // while
+    } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf16_to_utf32.cpp
 /* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
@@ -25913,754 +29297,816 @@ std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* b
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
-template <endianness big_endian>
-std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* end = buf + len;
+template<endianness big_endian>
+std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output)
+{ // Widens UTF-16 (byte-swapped first when big_endian) to UTF-32; returns {nullptr, utf32_output} on a bad surrogate, else {first unconverted word, utf32_output}.
+    const char16_t* end = buf + len;
 
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); // mask keeping the top five bits of each 16-bit word
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); // (word & 0xf800) == 0xd800 identifies a surrogate (0xD800..0xDFFF)
 
-  while (buf + 16 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
+    while (buf + 16 <= end) { // requires 16 remaining words: the SIMD path consumes 8, the scalar fallback up to 16
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
 
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: no surrogate pair, extend 16-bit words to 32-bit words
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
-        utf32_output += 8;
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+        if (big_endian) { // swap the two bytes of every 16-bit lane
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
+
+        // 1. Check whether the input chunk contains any surrogate words.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if only the last word is a surrogate (two mask bits per 16-bit word)
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: no surrogate in the chunk, zero-extend the eight 16-bit words to 32-bit words
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(in));
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+            utf32_output += 8;
+            buf += 8;
+            // otherwise: surrogate word(s) in the register, handled by the else branch
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(nullptr, utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Fall back to scalar code for this chunk.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15; // at most 15 iterations, so buf[k + 1] stays within the 16 words checked by the loop condition
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // defensive clamp; appears unreachable here since the while condition guarantees 16 words
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // word is a surrogate: expect a high surrogate followed by a low surrogate
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++; // the pair consumes two input words
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) { // word is not a high surrogate, or next_word is not a low surrogate
+                        return std::make_pair(nullptr, utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000; // combine the two 10-bit halves into one code point
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k; // advance past the words consumed by the scalar loop
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(buf, utf32_output);
+    } // while
+    return std::make_pair(buf, utf32_output);
 }
 
-
 /*
   Returns a pair: a result struct and utf8_output.
   If there is an error, the count field of the result is the position of the error.
   Otherwise, it is the position of the first unprocessed byte in buf (even if finished).
   A scalar routing should carry on the conversion of the tail if needed.
 */
-template <endianness big_endian>
-std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) {
-  const char16_t* start = buf;
-  const char16_t* end = buf + len;
+template<endianness big_endian>
+std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output)
+{ // Widens UTF-16 (byte-swapped first when big_endian) to UTF-32; reports SURROGATE with the offending word index, else SUCCESS with the words consumed.
+    const char16_t* start = buf;
+    const char16_t* end = buf + len;
+
+    const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); // mask keeping the top five bits of each 16-bit word
+    const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); // (word & 0xf800) == 0xd800 identifies a surrogate (0xD800..0xDFFF)
 
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+    while (buf + 16 <= end) { // requires 16 remaining words: the SIMD path consumes 8, the scalar fallback up to 16
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
 
-  while (buf + 16 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
+        if (big_endian) { // swap the two bytes of every 16-bit lane
+            const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+            in = _mm_shuffle_epi8(in, swap);
+        }
 
-    if (big_endian) {
-      const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
-    }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
-    // it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: no surrogate pair, extend 16-bit words to 32-bit words
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), _mm_cvtepu16_epi32(in));
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output+4), _mm_cvtepu16_epi32(_mm_srli_si128(in,8)));
-        utf32_output += 8;
-        buf += 8;
-    // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
-        if((word &0xF800 ) != 0xD800) {
-          *utf32_output++ = char32_t(word);
+        // 1. Check whether the input chunk contains any surrogate words.
+        //    We also have to deal with the situation where a surrogate word
+        //    sits at the end of a chunk.
+        const __m128i surrogates_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+
+        // bitmask = 0x0000 if there are no surrogates
+        //         = 0xc000 if only the last word is a surrogate (two mask bits per 16-bit word)
+        const uint16_t surrogates_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+        // It might seem like checking for surrogates_bitmask == 0xc000 could help. However,
+        // it is likely an uncommon occurrence.
+        if (surrogates_bitmask == 0x0000) {
+            // case: no surrogate in the chunk, zero-extend the eight 16-bit words to 32-bit words
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output), _mm_cvtepu16_epi32(in));
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(utf32_output + 4), _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+            utf32_output += 8;
+            buf += 8;
+            // otherwise: surrogate word(s) in the register, handled by the else branch
         } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k+1]) : buf[k+1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if((diff | diff2) > 0x3FF)  { return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output); }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
+            // Fall back to scalar code for this chunk.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15; // at most 15 iterations, so buf[k + 1] stays within the 16 words checked by the loop condition
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // defensive clamp; appears unreachable here since the while condition guarantees 16 words
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+                if ((word & 0xF800) != 0xD800) {
+                    *utf32_output++ = char32_t(word);
+                } else {
+                    // word is a surrogate: expect a high surrogate followed by a low surrogate
+                    uint16_t diff = uint16_t(word - 0xD800);
+                    uint16_t next_word = big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+                    k++; // the pair consumes two input words
+                    uint16_t diff2 = uint16_t(next_word - 0xDC00);
+                    if ((diff | diff2) > 0x3FF) { // invalid pair; k was already advanced, so k - 1 below indexes the first word of the pair
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k - 1), utf32_output);
+                    }
+                    uint32_t value = (diff << 10) + diff2 + 0x10000; // combine the two 10-bit halves into one code point
+                    *utf32_output++ = char32_t(value);
+                }
+            }
+            buf += k; // advance past the words consumed by the scalar loop
         }
-      }
-      buf += k;
-    }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+    } // while
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); // count = number of input words consumed so far
 }
 /* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf32_to_utf8.cpp
 /* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) {
-
-  const char32_t* end = buf + len;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
-  __m128i running_max = _mm_setzero_si128();
-  __m128i forbidden_bytemask = _mm_setzero_si128();
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-    running_max = _mm_max_epu32(_mm_max_epu32(in, running_max), nextin);
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
-
-    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-    // Check for ASCII fast path
-    if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
-      running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin);
-      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
-      if(!_mm_testz_si128(nextin_16, v_ff80)) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        // Proceed with next input
-        in_16 = nextin_16;
-      } else {
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m128i t0 = _mm_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m128i t1 = _mm_and_si128(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 = _mm_and_si128(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 = _mm_or_si128(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-      // 4. pack the bytes
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
-    }
-
-
-    // Check for overflow in packing
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
-
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                            (one_or_two_bytes_bitmask & 0xaaaa);
-      if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
-
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-
-      buf += 8;
-    } else {
-      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xFFFF0000 )==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
-
-  // check for invalid input
-  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-  if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); }
-
-  return std::make_pair(buf, utf8_output);
-}
-
-
-std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-
-  const char32_t* end = buf + len;
-  const char32_t* start = buf;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
-  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-
-  const size_t safety_margin = 11; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-
-  while (buf + 16 + safety_margin <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-
-    // Check for too large input
-    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
-    if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
-    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
-
-    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-    // Check for ASCII fast path
-    if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
-      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
-      if(!_mm_testz_si128(nextin_16, v_ff80)) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        // Proceed with next input
-        in_16 = nextin_16;
-        __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
-        if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
-          return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-        }
-      } else {
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m128i t0 = _mm_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m128i t1 = _mm_and_si128(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 = _mm_and_si128(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 = _mm_or_si128(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
-      const uint16_t m0 = one_byte_bitmask & 0x5555;  // m0 = 0h0g0f0e0d0c0b0a
-      const uint16_t m1 = static_cast<uint16_t>(m0 >> 7);                    // m1 = 00000000h0g0f0e0
-      const uint8_t  m2 = static_cast<uint8_t>((m0 | m1) & 0xff);           // m2 =         hdgcfbea
-      // 4. pack the bytes
-      const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
-      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
-    }
-
-
-    // Check for overflow in packing
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-
-    if (saturation_bitmask == 0xffff) {
-      // case: words from register produce either 1, 2 or 3 UTF-8 bytes
-
-      // Check for illegal surrogate words
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
-      }
-
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
-
-        We expand the input word (16-bit) into two words (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two words we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128 (t1, vec(0b1000000000000000));
-
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef vec
-
-      // 4. expand words 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
-
-      // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask = (one_byte_bitmask & 0x5555) |
-                            (one_or_two_bytes_bitmask & 0xaaaa);
-      if(mask == 0) {
-        // We only have three-byte words. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
-
-      const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-
-      const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
-      utf8_output += row1[0];
-
-      buf += 8;
-    } else {
-      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFFFF80)==0) {
-          *utf8_output++ = char(word);
-        } else if((word & 0xFFFFF800)==0) {
-          *utf8_output++ = char((word>>6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if((word &0xFFFF0000 )==0) {
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output); }
-          *utf8_output++ = char((word>>12) | 0b11100000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output)
+{
+    const char32_t* end = buf + len;
+
+    const __m128i v_0000 = _mm_setzero_si128(); //__m128 = 128 bits
+    const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 0000
+    const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 0000
+    const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 0000
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
+    const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
+    __m128i running_max = _mm_setzero_si128();
+    __m128i forbidden_bytemask = _mm_setzero_si128();
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
+
+    while (buf + 16 + safety_margin <= end) { // buf is a char32_t pointer and each char32_t is 4 bytes (32 bits), so buf + 16 spans 16 char32_t = 64 bytes = 512 bits
+        // We load two 16-byte registers for a total of 32 bytes, i.e. 8 UTF-32 characters.
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1); // These two values can hold only 8 UTF32 chars
+        running_max = _mm_max_epu32(
+            _mm_max_epu32(in, running_max), // take element-wise max char32_t from in and running_max vector
+            nextin); // and take element-wise max element from nextin and running_max vector
+
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m128i in_16 = _mm_packus_epi32(
+            _mm_and_si128(in, v_7fffffff),
+            _mm_and_si128(nextin, v_7fffffff)); // packs the two __m128i inputs (4 x 32-bit each) into a single vector of 8 x 16-bit words
+        // Clearing the highest bit (& v_7fffffff) makes every lane non-negative when it is interpreted as a signed 32-bit integer, which is how _mm_packus_epi32 reads its inputs.
+        // Remember: a leading bit of 0 means a non-negative number in two's complement. Valid Unicode code points are far below 2^31, so the mask does not alter them.
+
+        // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+        // Check for ASCII fast path
+
+        // ASCII fast path!!!!
+        // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+        // The intuition is that we try to collect 16 ASCII characters which requires
+        // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
+        // as our new inputs.
+        if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
+            __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
+            __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
+            running_max = _mm_max_epu32(_mm_max_epu32(thirdin, running_max), fourthin); // take the running max of all 4 vectors thus far
+            __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff)); // pack into 1 vector, now you have two
+            if (!_mm_testz_si128(nextin_16, v_ff80)) { // checks if the second packed vector is ASCII, if not:
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16); // narrows in_16 to 8 bytes; the result holds two copies of them in one vector
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed); // put them into the output
+                // 3. adjust pointers
+                buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32 bits =  256 bits
+                utf8_output += 8; // same with output, e.g. lift the first two blocks alone.
+                // Proceed with next input
+                in_16 = nextin_16;
+                // We need to update in and nextin because they are used later.
+                in = thirdin;
+                nextin = fourthin;
+            } else {
+                // 1. pack the bytes
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
+        }
+
+        // no bits set above 7th bit -- find out all the ASCII characters
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16( // this compares two bytes (one 16-bit unit) at a time:
+            _mm_and_si128(in_16, v_ff80), // the vector that keeps only the upper 9 bits of each 16-bit/2-byte unit
+            v_0000 // against zero
+        ); // they should be all zero if they are ASCII. E.g. an ASCII character packed into 16 bits has the format 0000 0000 0XXX XXXX
+        // _mm_cmpeq_epi16 returns 1111 1111 1111 1111 for each 16-bit/2-byte unit that is equal, and 0000 0000 0000 0000 otherwise
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask)); // collect the MSB of every byte of the previous vector into a uint16_t mask
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00); // potential first utf8 byte
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in_16, v_003f); // potential second utf8 byte
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2); // first and second potential utf8 byte together
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // Check for overflow in packing
+
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+        if (saturation_bitmask == 0xffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+            forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
+
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+                1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
         } else {
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf- start + k), utf8_output); }
-          *utf8_output++ = char((word>>18) | 0b11110000);
-          *utf8_output++ = char(((word>>12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+            // case: at least one 32-bit word produces a surrogate pair in UTF-16 <=> will produce four UTF-8 bytes
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
+    } // while
+
+    // check for invalid input
+    const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+    if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(_mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
+        return std::make_pair(nullptr, utf8_output);
     }
-  } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+    if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf8_output);
+    }
+
+    return std::make_pair(buf, utf8_output);
 }
-/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
-/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output) {
 
-  const char32_t* end = buf + len;
+std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output)
+{
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
-  __m128i forbidden_bytemask = _mm_setzero_si128();
+    const char32_t* end = buf + len;
+    const char32_t* start = buf;
 
-  while (buf + 8 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+    const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
+    const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+    const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
+    const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
 
-    // Check if no bits set above 16th
-    if (saturation_bitmask == 0xffff) {
-      // Pack UTF-32 to UTF-16
-      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+    const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
+    while (buf + 16 + safety_margin <= end) {
+        // We load two 16-byte registers for a total of 32 bytes, i.e. 8 UTF-32 characters.
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
 
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
+        // Check for too large input
+        __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
+        if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
+            return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+        }
 
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
-        } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(nullptr, utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+        // Pack 32-bit UTF-32 words to 16-bit UTF-16 words with unsigned saturation
+        __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), _mm_and_si128(nextin, v_7fffffff));
+
+        // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+
+        // Check for ASCII fast path
+        if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
+            // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+            // The intuition is that we try to collect 16 ASCII characters which requires
+            // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
+            // as our new inputs.
+            __m128i thirdin = _mm_loadu_si128((__m128i*)buf + 2);
+            __m128i fourthin = _mm_loadu_si128((__m128i*)buf + 3);
+            __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
+            if (!_mm_testz_si128(nextin_16, v_ff80)) {
+                // 1. pack the bytes
+                // obviously suboptimal.
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 8;
+                utf8_output += 8;
+                // Proceed with next input
+                in_16 = nextin_16;
+                __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
+                if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
+                    return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
+                }
+                // We need to update in and nextin because they are used later.
+                in = thirdin;
+                nextin = fourthin;
+            } else {
+                // 1. pack the bytes
+                const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+                // 2. store (16 bytes)
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+                // 3. adjust pointers
+                buf += 16;
+                utf8_output += 16;
+                continue; // we are done for this round!
+            }
         }
-      }
-      buf += k;
-    }
-  }
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); }
+        // no bits set above 7th bit
+        const __m128i one_byte_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
+        const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+        // no bits set above 11th bit
+        const __m128i one_or_two_bytes_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+        const uint16_t one_or_two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+        if (one_or_two_bytes_bitmask == 0xffff) {
+            // case: all words either produce 1 or 2 UTF-8 bytes (at least one produces 2 bytes)
+            // 1. prepare 2-byte values
+            // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+            // expected output   : [110a|aaaa|10bb|bbbb] x 8
+            const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+            const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+            // t0 = [000a|aaaa|bbbb|bb00]
+            const __m128i t0 = _mm_slli_epi16(in_16, 2);
+            // t1 = [000a|aaaa|0000|0000]
+            const __m128i t1 = _mm_and_si128(t0, v_1f00);
+            // t2 = [0000|0000|00bb|bbbb]
+            const __m128i t2 = _mm_and_si128(in_16, v_003f);
+            // t3 = [000a|aaaa|00bb|bbbb]
+            const __m128i t3 = _mm_or_si128(t1, t2);
+            // t4 = [110a|aaaa|10bb|bbbb]
+            const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+            // 2. merge ASCII and 2-byte codewords
+            const __m128i utf8_unpacked = _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
+
+            // 3. prepare bitmask for 8-bit lookup
+            //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - LSB)
+            const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+            const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+            const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
+            // 4. pack the bytes
+            const uint8_t* row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+            const __m128i shuffle = _mm_loadu_si128((__m128i*)(row + 1));
+            const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+            // 5. store bytes
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
+
+            // 6. adjust pointers
+            buf += 8;
+            utf8_output += row[0];
+            continue;
+        }
+
+        // Check for overflow in packing
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
 
-  return std::make_pair(buf, utf16_output);
+        if (saturation_bitmask == 0xffff) {
+            // case: words from register produce either 1, 2 or 3 UTF-8 bytes
+
+            // Check for illegal surrogate words
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+            const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output);
+            }
+
+            const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+            /* In this branch we handle three cases:
+                1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single UTF-8 byte
+                2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two UTF-8 bytes
+                3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes
+
+              We expand the input word (16-bit) into two words (32-bit), thus
+              we have room for four bytes. However, we need five distinct bit
+              layouts. Note that the last byte in cases #2 and #3 is the same.
+
+              We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+              in register t2.
+
+              We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+              either byte 1 for case #2 or byte 2 for case #3. Note that they
+              differ by exactly one bit.
+
+              Finally from these two words we build proper UTF-8 sequence, taking
+              into account the case (i.e, the number of bytes to write).
+            */
+            /**
+             * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+             * t2 => [0ccc|cccc] [10cc|cccc]
+             * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+             */
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
+            // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+            const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
+            // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+            const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+            // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+            const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+
+            // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+            const __m128i s0 = _mm_srli_epi16(in_16, 4);
+            // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+            const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
+            // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+            const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
+            // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+            const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+            const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, simdutf_vec(0b0100000000000000));
+            const __m128i s4 = _mm_xor_si128(s3, m0);
+#undef simdutf_vec
+
+            // 4. expand words 16-bit => 32-bit
+            const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+            const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+
+            // 5. compress 32-bit words into 1, 2 or 3 bytes -- 2 x shuffle
+            const uint16_t mask = (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+            if (mask == 0) {
+                // We only have three-byte words. Use fast path.
+                const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, -1, -1, -1, -1);
+                const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+                const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+                utf8_output += 12;
+                _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+                utf8_output += 12;
+                buf += 8;
+                continue;
+            }
+            const uint8_t mask0 = uint8_t(mask);
+
+            const uint8_t* row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+            const __m128i shuffle0 = _mm_loadu_si128((__m128i*)(row0 + 1));
+            const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+
+            const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
+            const uint8_t* row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+            const __m128i shuffle1 = _mm_loadu_si128((__m128i*)(row1 + 1));
+            const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_0);
+            utf8_output += row0[0];
+            _mm_storeu_si128((__m128i*)utf8_output, utf8_1);
+            utf8_output += row1[0];
+
+            buf += 8;
+        } else {
+            // case: at least one 32-bit word would produce a surrogate pair in UTF-16 <=> it will produce four UTF-8 bytes
+            // Let us do a scalar fallback.
+            // It may seem wasteful to use scalar code, but being efficient with SIMD
+            // in the presence of surrogate pairs may require non-trivial tables.
+            size_t forward = 15;
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) {
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFFFF80) == 0) {
+                    *utf8_output++ = char(word);
+                } else if ((word & 0xFFFFF800) == 0) {
+                    *utf8_output++ = char((word >> 6) | 0b11000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else if ((word & 0xFFFF0000) == 0) {
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 12) | 0b11100000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                } else {
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+                    }
+                    *utf8_output++ = char((word >> 18) | 0b11110000);
+                    *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+                    *utf8_output++ = char((word & 0b111111) | 0b10000000);
+                }
+            }
+            buf += k;
+        }
+    } // while
+
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
+/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=westmere/sse_convert_utf32_to_utf16.cpp
+/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
+template<endianness big_endian>
+std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // SSE UTF-32 -> UTF-16 (byte-swapped when big_endian). Returns {first unprocessed input word, end of output}, or {nullptr, end of output} on invalid input.
 
+    const char32_t* end = buf + len;
 
-template <endianness big_endian>
-std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) {
-  const char32_t* start = buf;
-  const char32_t* end = buf + len;
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+    __m128i forbidden_bytemask = _mm_setzero_si128(); // accumulates surrogate hits from every SIMD batch; inspected once, after the loop
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+    while (buf + 8 <= end) { // 8 UTF-32 words per iteration; a tail of fewer than 8 words is not processed here
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+        // Check if no bits set above 16th (i.e. all 8 words fit in a single UTF-16 code unit)
+        if (saturation_bitmask == 0xffff) {
+            // Pack UTF-32 to UTF-16
+            __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+            const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); // (w & 0xf800) == 0xd800 <=> w is a surrogate (0xD800..0xDFFF)
+            forbidden_bytemask = _mm_or_si128(forbidden_bytemask, _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
+
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
 
-  while (buf + 8 <= end) {
-    __m128i in = _mm_loadu_si128((__m128i*)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
+        } else {
+            size_t forward = 7; // scalar path: convert up to 7 words, then go back to the SIMD loop
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // defensive clamp: the loop condition already guarantees end - buf >= 8
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(nullptr, utf16_output); // invalid input: surrogate code point
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(nullptr, utf16_output); // invalid input: above U+10FFFF
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
+        }
+    }
 
-    // Check if no bits set above 16th
-    if (saturation_bitmask == 0xffff) {
-      // Pack UTF-32 to UTF-16
-      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+    // check for invalid input: surrogates met on the SIMD path are only reported here, after their batches were already written
+    if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+        return std::make_pair(nullptr, utf16_output);
+    }
 
-      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output);
-      }
+    return std::make_pair(buf, utf16_output); // buf = first input word not consumed (at most 7 words remain)
+}
 
-      if (big_endian) {
-        const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
+template<endianness big_endian>
+std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output)
+{ // SSE UTF-32 -> UTF-16 with error reporting. Returns {result(error code, input position in words), end of output}.
+    const char32_t* start = buf;
+    const char32_t* end = buf + len;
 
-      _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if(size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1);}
-      for(; k < forward; k++) {
-        uint32_t word = buf[k];
-        if((word & 0xFFFF0000)==0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); }
-          *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+    const __m128i v_0000 = _mm_setzero_si128();
+    const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+
+    while (buf + 8 <= end) { // 8 UTF-32 words per iteration; a tail of fewer than 8 words is not processed here
+        __m128i in = _mm_loadu_si128((__m128i*)buf);
+        __m128i nextin = _mm_loadu_si128((__m128i*)buf + 1);
+        const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
+        const uint32_t saturation_bitmask = static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+        // Check if no bits set above 16th (i.e. all 8 words fit in a single UTF-16 code unit)
+        if (saturation_bitmask == 0xffff) {
+            // Pack UTF-32 to UTF-16
+            __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+            const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+            const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); // (w & 0xf800) == 0xd800 <=> w is a surrogate (0xD800..0xDFFF)
+            const __m128i forbidden_bytemask = _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
+            if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+                return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); // position is the start of this 8-word batch, not the offending word; nothing from the batch has been written
+            }
+
+            if (big_endian) {
+                const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+                utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+            }
+
+            _mm_storeu_si128((__m128i*)utf16_output, utf16_packed);
+            utf16_output += 8;
+            buf += 8;
         } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
+            size_t forward = 7; // scalar path: convert up to 7 words, then go back to the SIMD loop
+            size_t k = 0;
+            if (size_t(end - buf) < forward + 1) { // defensive clamp: the loop condition already guarantees end - buf >= 8
+                forward = size_t(end - buf - 1);
+            }
+            for (; k < forward; k++) {
+                uint32_t word = buf[k];
+                if ((word & 0xFFFF0000) == 0) {
+                    // will not generate a surrogate pair
+                    if (word >= 0xD800 && word <= 0xDFFF) {
+                        return std::make_pair(result(error_code::SURROGATE, buf - start + k), utf16_output); // exact index of the offending word
+                    }
+                    *utf16_output++ = big_endian ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) : char16_t(word);
+                } else {
+                    // will generate a surrogate pair
+                    if (word > 0x10FFFF) {
+                        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), utf16_output); // exact index of the offending word
+                    }
+                    word -= 0x10000;
+                    uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+                    uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+                    if (big_endian) {
+                        high_surrogate = uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+                        low_surrogate = uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                    }
+                    *utf16_output++ = char16_t(high_surrogate);
+                    *utf16_output++ = char16_t(low_surrogate);
+                }
+            }
+            buf += k;
         }
-      }
-      buf += k;
     }
-  }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+    return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); // count = words consumed; at most 7 trailing words remain unprocessed
 }
 /* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
 
@@ -26668,7 +30114,7 @@ std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32
 } // namespace westmere
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/buf_block_reader.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/buf_block_reader.h
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
 namespace westmere {
@@ -26678,92 +30124,110 @@ namespace {
 template<size_t STEP_SIZE>
 struct buf_block_reader {
 public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0 (in which case this
-   * function fills the buffer with spaces and returns 0. In particular, if len == STEP_SIZE there
-   * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+    simdutf_really_inline buf_block_reader(const uint8_t* _buf, size_t _len);
+    simdutf_really_inline size_t block_index(); // current read offset (idx)
+    simdutf_really_inline bool has_full_block() const; // true while idx < lenminusstep
+    simdutf_really_inline const uint8_t* full_block() const; // pointer to buf[idx]; performs no bounds check itself
+    /**
+     * Get the last block, padded with spaces.
+     *
+     * There will always be a last block, with at least 1 byte, unless len == 0, in which case this
+     * function returns 0 and leaves dst untouched. In particular, if len == STEP_SIZE there
+     * will be 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no spaces for padding.
+     *
+     * @return the number of effective characters in the last block.
+     */
+    simdutf_really_inline size_t get_remainder(uint8_t* dst) const;
+    simdutf_really_inline void advance(); // idx += STEP_SIZE
+
 private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
+    const uint8_t* buf; // start of the input
+    const size_t len; // total input length in bytes
+    const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+    size_t idx; // current read offset into buf
 };
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text_64(const uint8_t *text) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text_64(const uint8_t* text)
+{ // Debug helper: renders sizeof(simd8x64<uint8_t>) bytes of text as a NUL-terminated, printable string.
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // one buffer shared by all calls: allocated on first use, never freed; the malloc result is not checked
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // bytes below 0x20 and bytes >= 0x80 (negative as int8_t) are shown as '_'
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf; // points at the shared static buffer; the next call overwrites it
 }
 
 // Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char * format_input_text(const simd8x64<uint8_t>& in) {
-  static char *buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t*>(buf));
-  for (size_t i=0; i<sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') { buf[i] = '_'; }
-  }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+simdutf_unused static char* format_input_text(const simd8x64<uint8_t>& in)
+{ // Debug helper: renders the contents of a simd8x64 block as a NUL-terminated, printable string.
+    static char* buf = reinterpret_cast<char*>(malloc(sizeof(simd8x64<uint8_t>) + 1)); // one buffer shared by all calls: allocated on first use, never freed; the malloc result is not checked
+    in.store(reinterpret_cast<uint8_t*>(buf)); // presumably writes sizeof(simd8x64<uint8_t>) bytes into buf -- confirm against simd8x64::store
+    for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+        if (buf[i] < ' ') { // compared as plain char: bytes >= 0x80 are replaced too only where char is signed
+            buf[i] = '_';
+        }
+    }
+    buf[sizeof(simd8x64<uint8_t>)] = '\0';
+    return buf; // points at the shared static buffer; the next call overwrites it
 }
 
-simdutf_unused static char * format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char*>(malloc(64 + 1));
-  for (size_t i=0; i<64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_unused static char* format_mask(uint64_t mask)
+{ // Debug helper: renders the 64 bits of mask, bit 0 first, as 'X' (set) or ' ' (clear), NUL-terminated.
+    static char* buf = reinterpret_cast<char*>(malloc(64 + 1)); // one buffer shared by all calls: allocated on first use, never freed; the malloc result is not checked
+    for (size_t i = 0; i < 64; i++) {
+        buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' '; // uint64_t(1), not size_t(1): shifting a 32-bit size_t by i >= 32 would be undefined
+    }
+    buf[64] = '\0';
+    return buf; // points at the shared static buffer; the next call overwrites it
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len) : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, idx{0} {}
+simdutf_really_inline buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t* _buf, size_t _len)
+    : buf { _buf } // the input is referenced, not copied
+    , len { _len }
+    , lenminusstep { len < STEP_SIZE ? 0 : len - STEP_SIZE } // clamped to 0 so the unsigned subtraction cannot wrap; reads the member len, which is declared before lenminusstep
+    , idx { 0 } // reading starts at the beginning of buf
+{
+}
 
 template<size_t STEP_SIZE>
 simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() { return idx; }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const
+{
+    return idx < lenminusstep; // strict '<': when exactly STEP_SIZE bytes remain (idx == len - STEP_SIZE) this is false, so that last block counts as the remainder
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_really_inline const uint8_t* buf_block_reader<STEP_SIZE>::full_block() const
+{
+    return &buf[idx]; // address of the current position; no bounds check is done here
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if(len == idx) { return 0; } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20, STEP_SIZE); // std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::get_remainder(uint8_t* dst) const
+{ // Copies the unread tail (len - idx bytes) into dst, space-padded to STEP_SIZE bytes, and returns the tail length.
+    if (len == idx) {
+        return 0; // nothing left to copy: dst is NOT written in this case
+    } // the early-out also avoids memcpy(dst, null, 0), which will trigger an error with some sanitizers
+    std::memset(dst, 0x20, STEP_SIZE); // pre-fill with spaces (0x20); std::memset STEP_SIZE because it's more efficient to write out 8 or 16 bytes at once.
+    std::memcpy(dst, buf + idx, len - idx); // assumes len - idx <= STEP_SIZE, i.e. no full block is left -- TODO confirm against callers
+    return len - idx;
 }
 
 template<size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance()
+{
+    idx += STEP_SIZE; // move to the next block; idx is not checked against len here
 }
 
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_lookup4_algorithm.h
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
 namespace westmere {
@@ -26772,21 +30236,22 @@ namespace utf8_validation {
 
 using namespace simd;
 
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -26794,101 +30259,92 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{ // sc = flags from check_special_cases. Flips bit 0x80 (the TWO_CONTS bit) in sc wherever must_be_2_3_continuation() sets it; check_utf8_bytes ORs the result into its error accumulator.
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
+}
 
-  //
-  // Return nonzero if there are incomplete multibyte characters at the end of the block:
-  // e.g. if there is a 4-byte character, but it's 3 bytes from the end.
-  //
-  simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the block:
+// e.g. if there is a 4-byte character, but it's 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input)
+{
     // If the previous input's last 3 bytes match this, they're too short (they ended at EOF):
     // ... 1111____ 111_____ 11______
     static const uint8_t max_array[32] = {
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 255, 255, 255,
-      255, 255, 255, 255, 255, 0b11110000u-1, 0b11100000u-1, 0b11000000u-1
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 255, 255, 255,
+        255, 255, 255, 255, 255, 0b11110000u - 1, 0b11100000u - 1, 0b11000000u - 1 // thresholds for the last three bytes; a threshold of 255 can never be exceeded by a byte
     };
-    const simd8<uint8_t> max_value(&max_array[sizeof(max_array)-sizeof(simd8<uint8_t>)]);
+    const simd8<uint8_t> max_value(&max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); // load the final sizeof(simd8<uint8_t>) entries so the three thresholds land on the last three lanes
     return input.gt_bits(max_value);
-  }
+}
 
-  struct utf8_checker {
+struct utf8_checker {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
     // The last input we received
@@ -26899,51 +30355,54 @@ using namespace simd;
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
     }
 
     // The only problem that can happen at EOF is that a multibyte character is too short
     // or a byte value too large in the last bytes: check_special_cases only checks for bytes
     // too large in the first of two bytes.
-    simdutf_really_inline void check_eof() {
-      // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
-      // possibly finish them.
-      this->error |= this->prev_incomplete;
+    simdutf_really_inline void check_eof()
+    {
+        // If the previous block had incomplete UTF-8 characters at the end, an ASCII block can't
+        // possibly finish them.
+        this->error |= this->prev_incomplete;
     }
 
-    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input) {
-      if(simdutf_likely(is_ascii(input))) {
-        this->error |= this->prev_incomplete;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-        static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+    simdutf_really_inline void check_next_input(const simd8x64<uint8_t>& input)
+    {
+        if (simdutf_likely(is_ascii(input))) {
+            this->error |= this->prev_incomplete;
+        } else {
+            // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+            static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                "We support either two or four chunks per 64-byte block.");
+            if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+            } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+                this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+            }
+            this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+            this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
         }
-        this->prev_incomplete = is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1]);
-        this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS-1];
-
-      }
     }
 
     // do not forget to call check_eof!
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // namespace utf8_validation
 
 using utf8_validation::utf8_checker;
@@ -26952,7 +30411,7 @@ using utf8_validation::utf8_checker;
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_validation/utf8_validator.h
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
 namespace westmere {
@@ -26963,15 +30422,16 @@ namespace utf8_validation {
  * Validates that the string is actual UTF-8.
  */
 template<class checker>
-bool generic_validate_utf8(const uint8_t * input, size_t length) {
-    checker c{};
+bool generic_validate_utf8(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
@@ -26980,97 +30440,106 @@ bool generic_validate_utf8(const uint8_t * input, size_t length) {
     return !c.errors();
 }
 
-bool generic_validate_utf8(const char * input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_utf8(const char* input, size_t length)
+{
+    return generic_validate_utf8<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 /**
  * Validates that the string is actual UTF-8 and stops on errors.
  */
 template<class checker>
-result generic_validate_utf8_with_errors(const uint8_t * input, size_t length) {
-    checker c{};
+result generic_validate_utf8_with_errors(const uint8_t* input, size_t length)
+{
+    checker c {};
     buf_block_reader<64> reader(input, length);
-    size_t count{0};
+    size_t count { 0 };
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      c.check_next_input(in);
-      if(c.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-        res.count += count;
-        return res;
-      }
-      reader.advance();
-      count += 64;
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        c.check_next_input(in);
+        if (c.errors()) {
+            if (count != 0) {
+                count--;
+            } // Sometimes the error is only detected in the next chunk
+            result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            res.count += count;
+            return res;
+        }
+        reader.advance();
+        count += 64;
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     c.check_next_input(in);
     reader.advance();
     c.check_eof();
     if (c.errors()) {
-      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
-      res.count += count;
-      return res;
+        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(input) + count, length - count);
+        res.count += count;
+        return res;
     } else {
-      return result(error_code::SUCCESS, length);
+        return result(error_code::SUCCESS, length);
     }
 }
 
-result generic_validate_utf8_with_errors(const char * input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_utf8_with_errors(const char* input, size_t length)
+{
+    return generic_validate_utf8_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-bool generic_validate_ascii(const uint8_t * input, size_t length) {
+bool generic_validate_ascii(const uint8_t* input, size_t length)
+{
     buf_block_reader<64> reader(input, length);
-    uint8_t blocks[64]{};
+    uint8_t blocks[64] {};
     simd::simd8x64<uint8_t> running_or(blocks);
     while (reader.has_full_block()) {
-      simd::simd8x64<uint8_t> in(reader.full_block());
-      running_or |= in;
-      reader.advance();
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        running_or |= in;
+        reader.advance();
     }
-    uint8_t block[64]{};
+    uint8_t block[64] {};
     reader.get_remainder(block);
     simd::simd8x64<uint8_t> in(block);
     running_or |= in;
     return running_or.is_ascii();
 }
 
-bool generic_validate_ascii(const char * input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+bool generic_validate_ascii(const char* input, size_t length)
+{
+    return generic_validate_ascii<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 template<class checker>
-result generic_validate_ascii_with_errors(const uint8_t * input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
+result generic_validate_ascii_with_errors(const uint8_t* input, size_t length)
+{
+    buf_block_reader<64> reader(input, length);
+    size_t count { 0 };
+    while (reader.has_full_block()) {
+        simd::simd8x64<uint8_t> in(reader.full_block());
+        if (!in.is_ascii()) {
+            result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+            return result(res.error, count + res.count);
+        }
+        reader.advance();
+
+        count += 64;
+    }
+    uint8_t block[64] {};
+    reader.get_remainder(block);
+    simd::simd8x64<uint8_t> in(block);
     if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-      return result(res.error, count + res.count);
+        result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
+        return result(res.error, count + res.count);
+    } else {
+        return result(error_code::SUCCESS, length);
     }
-    reader.advance();
-
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(reinterpret_cast<const char*>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
 }
 
-result generic_validate_ascii_with_errors(const char * input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t *>(input),length);
+result generic_validate_ascii_with_errors(const char* input, size_t length)
+{
+    return generic_validate_ascii_with_errors<utf8_checker>(reinterpret_cast<const uint8_t*>(input), length);
 }
 
 } // namespace utf8_validation
@@ -27079,10 +30548,9 @@ result generic_validate_ascii_with_errors(const char * input, size_t length) {
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/valid_utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace westmere {
 namespace {
@@ -27090,63 +30558,64 @@ namespace utf8_to_utf16 {
 
 using namespace simd;
 
-template <endianness endian>
+template<endianness endian>
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char16_t* utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the generic directory.
-  size_t pos = 0;
-  char16_t* start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the mask
-    // far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow path.
-      // Anything that is not a continuation mask is a 'leading byte', that is, the
-      // start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end* of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while(pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block.These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+    char16_t* utf16_output) noexcept
+{
+    // The implementation is not specific to haswell and should be moved to the generic directory.
+    size_t pos = 0;
+    char16_t* start { utf16_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        // this loop could be unrolled further. For example, we could process the mask
+        // far more than 64 bytes.
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf16<endian>(utf16_output);
+            utf16_output += 64;
+            pos += 64;
+        } else {
+            // Slow path. We hope that the compiler will recognize that this is a slow path.
+            // Anything that is not a continuation byte is a 'leading byte', that is, the
+            // start of a new code point.
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            // -65 is 0b10111111 in two's complement, so it is the largest possible continuation byte
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            // The *start* of code points is not so useful, rather, we want the *end* of code points.
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            // We process in blocks of up to 12 bytes except possibly
+            // for fast paths which may process up to 16 bytes. For the
+            // slow path to work, we should have at least 12 input bytes left.
+            size_t max_starting_point = (pos + 64) - 12;
+            // Next loop is going to run at least five times when using solely
+            // the slow/regular path, and at least four times if there are fast paths.
+            while (pos < max_starting_point) {
+                // Performance note: our ability to compute 'consumed' and
+                // then shift and recompute is critical. If there is a
+                // latency of, say, 4 cycles on getting 'consumed', then
+                // the inner loop might have a total latency of about 6 cycles.
+                // Yet we process between 6 and 12 input bytes, thus we get
+                // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                // for this section of the code. Hence, there is a limit
+                // to how much we can further increase this latency before
+                // it seriously harms performance.
+                //
+                // Thus we may allow convert_masked_utf8_to_utf16 to process
+                // more bytes at a time under a fast-path mode where 16 bytes
+                // are consumed at once (e.g., when encountering ASCII).
+                size_t consumed = convert_masked_utf8_to_utf16<endian>(input + pos,
+                    utf8_end_of_code_point_mask, utf16_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+            // At this point there may remain between 0 and 12 bytes in the
+            // 64-byte block. These bytes will be processed again. So we have an
+            // 80% efficiency (in the worst case). In practice we expect an
+            // 85% to 90% efficiency.
+        }
+    }
+    utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(input + pos, size - pos, utf16_output);
+    return utf16_output - start;
 }
 
 } // namespace utf8_to_utf16
@@ -27154,32 +30623,31 @@ simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf16/utf8_to_utf16.h
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -27187,258 +30655,281 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-    template <endianness endian>
-    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
-        if(howmany == 0) { return 0; }
-        utf16_output += howmany;
-      }
-      return utf16_output - start;
-    }
-
-    template <endianness endian>
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output) {
-      size_t pos = 0;
-      char16_t* start{utf16_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf16<endian>(utf16_output);
-          utf16_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    {
+        // Validate one chunk: prev1 is input.prev<1>(prev_input) (presumably each byte's predecessor).
+        // The result of check_multibyte_lengths is OR-ed into this->error, which errors() inspects.
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc);
+    }
+
+    template<endianness endian>
+    simdutf_really_inline size_t convert(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // Converts `size` bytes at `in` to UTF-16 at `utf16_output`, checking UTF-8 validity on the way.
+        // Returns utf16_output - start, or 0 if errors() is set or the scalar tail conversion returns 0.
+        // Safety margin: in the worst case, the haswell kernel can cause an overflow of 8 bytes when
+        // calling convert_masked_utf8_to_utf16. Skipping the last 16 bytes would be safe for valid data,
+        // because 16 UTF-8 bytes generate much more than 8 bytes, but the input cannot be assumed valid,
+        // so we go back from the end counting 8 leading bytes to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth leading byte counted from the end; otherwise margin is 0.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
+            return 0;
+        }
+        if (pos < size) {
+            size_t howmany = scalar::utf8_to_utf16::convert<endian>(in + pos, size - pos, utf16_output);
+            if (howmany == 0) {
+                return 0;
+            }
+            utf16_output += howmany;
+        }
+        return utf16_output - start;
+    }
+
+    template<endianness endian>
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char16_t* utf16_output)
+    {
+        size_t pos = 0;
+        char16_t* start { utf16_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 8; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65);
+        }
+        // If the input is long enough, in[margin] is now the eighth leading byte counted from the end; otherwise margin is 0.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf16<endian>(utf16_output);
+                utf16_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) {
+                    // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+                    // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+                    result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+                    res.count += pos;
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
+                        utf8_end_of_code_point_mask, utf16_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) {
             // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
             // with the ability to go back up to pos bytes, and read size-pos bytes forward.
             result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf16<endian>(in + pos,
-                            utf8_end_of_code_point_mask, utf16_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
-        // with the ability to go back up to pos bytes, and read size-pos bytes forward.
-        result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf16_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf16_output - start);
+        if (pos < size) {
+            // rewind_and_convert_with_errors will seek a potential error from in+pos onward,
+            // with the ability to go back up to pos bytes, and read size-pos bytes forward.
+            result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(pos, in + pos, size - pos, utf16_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of word written
+                utf16_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf16_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    {
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf16 namespace
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/valid_utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
@@ -27448,68 +30939,66 @@ namespace utf8_to_utf32 {
 
 using namespace simd;
 
-
 simdutf_warn_unused size_t convert_valid(const char* input, size_t size,
-    char32_t* utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t* start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while(pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if(in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
-    } else {
-    // -65 is 0b10111111 in two-complement's, so largest possible continuation byte
-    uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-    uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-    uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-    size_t max_starting_point = (pos + 64) - 12;
-    while(pos < max_starting_point) {
-      size_t consumed = convert_masked_utf8_to_utf32(input + pos,
-                          utf8_end_of_code_point_mask, utf32_output);
-      pos += consumed;
-      utf8_end_of_code_point_mask >>= consumed;
-      }
+    char32_t* utf32_output) noexcept
+{
+    size_t pos = 0;
+    char32_t* start { utf32_output };
+    const size_t safety_margin = 16; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+        simd8x64<int8_t> in(reinterpret_cast<const int8_t*>(input + pos));
+        if (in.is_ascii()) {
+            in.store_ascii_as_utf32(utf32_output);
+            utf32_output += 64;
+            pos += 64;
+        } else {
+            // -65 is 0b10111111 in two's complement, i.e. the largest possible continuation byte
+            uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+            uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+            uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+            size_t max_starting_point = (pos + 64) - 12;
+            while (pos < max_starting_point) {
+                size_t consumed = convert_masked_utf8_to_utf32(input + pos,
+                    utf8_end_of_code_point_mask, utf32_output);
+                pos += consumed;
+                utf8_end_of_code_point_mask >>= consumed;
+            }
+        }
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
-  return utf32_output - start;
+    utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, utf32_output);
+    return utf32_output - start;
 }
 
-
 } // namespace utf8_to_utf32
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8_to_utf32/utf8_to_utf32.h
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
 
-
-  simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-// Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-// Bit 1 = Too Long (ASCII followed by continuation)
-// Bit 2 = Overlong 3-byte
-// Bit 4 = Surrogate
-// Bit 5 = Overlong 2-byte
-// Bit 7 = Two Continuations
-    constexpr const uint8_t TOO_SHORT   = 1<<0; // 11______ 0_______
+simdutf_really_inline simd8<uint8_t> check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1)
+{
+    // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+    // Bit 1 = Too Long (ASCII followed by continuation)
+    // Bit 2 = Overlong 3-byte
+    // Bit 4 = Surrogate
+    // Bit 5 = Overlong 2-byte
+    // Bit 7 = Two Continuations
+    constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______
                                                 // 11______ 11______
-    constexpr const uint8_t TOO_LONG    = 1<<1; // 0_______ 10______
-    constexpr const uint8_t OVERLONG_3  = 1<<2; // 11100000 100_____
-    constexpr const uint8_t SURROGATE   = 1<<4; // 11101101 101_____
-    constexpr const uint8_t OVERLONG_2  = 1<<5; // 1100000_ 10______
-    constexpr const uint8_t TWO_CONTS   = 1<<7; // 10______ 10______
-    constexpr const uint8_t TOO_LARGE   = 1<<3; // 11110100 1001____
+    constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______
+    constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+    constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____
+    constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+    constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______
+    constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____
                                                 // 11110100 101_____
                                                 // 11110101 1001____
                                                 // 11110101 101_____
@@ -27517,251 +31006,273 @@ using namespace simd;
                                                 // 1111011_ 101_____
                                                 // 11111___ 1001____
                                                 // 11111___ 101_____
-    constexpr const uint8_t TOO_LARGE_1000 = 1<<6;
-                                                // 11110101 1000____
-                                                // 1111011_ 1000____
-                                                // 11111___ 1000____
-    constexpr const uint8_t OVERLONG_4  = 1<<6; // 11110000 1000____
+    constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+    // 11110101 1000____
+    // 1111011_ 1000____
+    // 11111___ 1000____
+    constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
     const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
-    );
+        // 0_______ ________ <ASCII in byte 1>
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+        // 10______ ________ <continuation in byte 1>
+        TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+        // 1100____ ________ <two byte lead in byte 1>
+        TOO_SHORT | OVERLONG_2,
+        // 1101____ ________ <two byte lead in byte 1>
+        TOO_SHORT,
+        // 1110____ ________ <three byte lead in byte 1>
+        TOO_SHORT | OVERLONG_3 | SURROGATE,
+        // 1111____ ________ <four+ byte lead in byte 1>
+        TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
     constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
     const simd8<uint8_t> byte_1_low = (prev1 & 0x0F).lookup_16<uint8_t>(
-      // ____0000 ________
-      CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-      // ____0001 ________
-      CARRY | OVERLONG_2,
-      // ____001_ ________
-      CARRY,
-      CARRY,
-
-      // ____0100 ________
-      CARRY | TOO_LARGE,
-      // ____0101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____011_ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-
-      // ____1___ ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      // ____1101 ________
-      CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-      CARRY | TOO_LARGE | TOO_LARGE_1000,
-      CARRY | TOO_LARGE | TOO_LARGE_1000
-    );
+        // ____0000 ________
+        CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+        // ____0001 ________
+        CARRY | OVERLONG_2,
+        // ____001_ ________
+        CARRY, CARRY,
+
+        // ____0100 ________
+        CARRY | TOO_LARGE,
+        // ____0101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____011_ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+        // ____1___ ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000,
+        // ____1101 ________
+        CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, CARRY | TOO_LARGE | TOO_LARGE_1000, CARRY | TOO_LARGE | TOO_LARGE_1000);
     const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE  | TOO_LARGE,
-
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
-    );
+        // ________ 0_______ <ASCII in byte 2>
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+        // ________ 1000____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
+        // ________ 1001____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+        // ________ 101_____
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+        // ________ 11______
+        TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
     return (byte_1_high & byte_1_low & byte_2_high);
-  }
-  simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
-      const simd8<uint8_t> prev_input, const simd8<uint8_t> sc) {
+}
+simdutf_really_inline simd8<uint8_t> check_multibyte_lengths(const simd8<uint8_t> input,
+    const simd8<uint8_t> prev_input, const simd8<uint8_t> sc)
+{
     simd8<uint8_t> prev2 = input.prev<2>(prev_input);
     simd8<uint8_t> prev3 = input.prev<3>(prev_input);
     simd8<uint8_t> must23 = simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
     simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
     return must23_80 ^ sc;
-  }
-
+}
 
-  struct validating_transcoder {
+struct validating_transcoder {
     // If this is nonzero, there has been a UTF-8 error.
     simd8<uint8_t> error;
 
-    validating_transcoder() : error(uint8_t(0)) {}
+    validating_transcoder()
+        : error(uint8_t(0))
+    {
+    }
     //
     // Check whether the current bytes are valid UTF-8.
     //
-    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input) {
-      // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
-      // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
-      simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-      simd8<uint8_t> sc = check_special_cases(input, prev1);
-      this->error |= check_multibyte_lengths(input, prev_input, sc);
-    }
-
-
-
-    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) { return 0; }
-      if(pos < size) {
-        size_t howmany  = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-        if(howmany == 0) { return 0; }
-        utf32_output += howmany;
-      }
-      return utf32_output - start;
-    }
-
-    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output) {
-      size_t pos = 0;
-      char32_t* start{utf32_output};
-      const size_t safety_margin = 16; // to avoid overruns!
-      while(pos + 64 + safety_margin <= size) {
-        simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-        if(input.is_ascii()) {
-          input.store_ascii_as_utf32(utf32_output);
-          utf32_output += 64;
-          pos += 64;
-        } else {
-          // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
-          static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-              "We support either two or four chunks per 64-byte block.");
-          auto zero = simd8<uint8_t>{uint8_t(0)};
-          if(simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          } else if(simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-            this->check_utf8_bytes(input.chunks[0], zero);
-            this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-            this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-            this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-          }
-          if (errors()) {
+    simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input, const simd8<uint8_t> prev_input)
+    { // Checks one SIMD chunk (`input`) together with the chunk before it (`prev_input`); any violation is OR-ed into this->error.
+        // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ lead bytes
+        // (2, 3, 4-byte leads become large positive numbers instead of small negative numbers)
+        simd8<uint8_t> prev1 = input.prev<1>(prev_input); // presumably `input` shifted by one byte with the last byte of prev_input pulled in -- confirm against simd8::prev
+        simd8<uint8_t> sc = check_special_cases(input, prev1);
+        this->error |= check_multibyte_lengths(input, prev_input, sc); // errors are sticky: only ever OR-ed in here
+    }
+
+    simdutf_really_inline size_t convert(const char* in, size_t size, char32_t* utf32_output)
+    { // Transcodes the UTF-8 bytes in[0, size) to UTF-32; returns the number of char32_t written, or 0 if invalid input was detected.
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65); // > -65 as int8 means "not a continuation byte" (not 0x80..0xBF)
+        }
+        // If the input is long enough, `margin` is now the index of the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1); // signed compare: bytes below -64, i.e. 0x80..0xBF
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // a code point ends right before the next leading byte
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) { // errors accumulated by check_utf8_bytes are only inspected once the SIMD loop has finished
+            return 0;
+        }
+        if (pos < size) { // the remaining tail, safety margin included, goes through the scalar converter
+            size_t howmany = scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+            if (howmany == 0) { // a 0 from the scalar converter is propagated as failure
+                return 0;
+            }
+            utf32_output += howmany;
+        }
+        return utf32_output - start;
+    }
+
+    simdutf_really_inline result convert_with_errors(const char* in, size_t size, char32_t* utf32_output)
+    { // UTF-8 -> UTF-32 with error reporting: on failure the scalar rewind_and_convert_with_errors result is returned with its count offset by pos; on success the result holds error_code::SUCCESS and the number of char32_t written.
+        size_t pos = 0;
+        char32_t* start { utf32_output };
+        // In the worst case, we have the haswell kernel which can cause an overflow of
+        // 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
+        // and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
+        // much more than 8 bytes. However, you cannot generally assume that you have valid
+        // UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
+        // to give us a good margin.
+        size_t leading_byte = 0;
+        size_t margin = size;
+        for (; margin > 0 && leading_byte < 4; margin--) {
+            leading_byte += (int8_t(in[margin - 1]) > -65); // > -65 as int8 means "not a continuation byte" (not 0x80..0xBF)
+        }
+        // If the input is long enough, `margin` is now the index of the fourth-to-last leading byte.
+        const size_t safety_margin = size - margin + 1; // to avoid overruns!
+        while (pos + 64 + safety_margin <= size) {
+            simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+            if (input.is_ascii()) {
+                input.store_ascii_as_utf32(utf32_output);
+                utf32_output += 64;
+                pos += 64;
+            } else {
+                // you might think that a for-loop would work, but under Visual Studio, it is not good enough.
+                static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) || (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+                auto zero = simd8<uint8_t> { uint8_t(0) };
+                if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+                    this->check_utf8_bytes(input.chunks[0], zero);
+                    this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+                    this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+                    this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+                }
+                if (errors()) { // unlike convert(), bail out on the first bad block so the scalar code can report the error
+                    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+                    res.count += pos; // make the reported count relative to the start of `in`
+                    return res;
+                }
+                uint64_t utf8_continuation_mask = input.lt(-65 + 1); // signed compare: bytes below -64, i.e. 0x80..0xBF
+                uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+                uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // a code point ends right before the next leading byte
+                // We process in blocks of up to 12 bytes except possibly
+                // for fast paths which may process up to 16 bytes. For the
+                // slow path to work, we should have at least 12 input bytes left.
+                size_t max_starting_point = (pos + 64) - 12;
+                // Next loop is going to run at least five times.
+                while (pos < max_starting_point) {
+                    // Performance note: our ability to compute 'consumed' and
+                    // then shift and recompute is critical. If there is a
+                    // latency of, say, 4 cycles on getting 'consumed', then
+                    // the inner loop might have a total latency of about 6 cycles.
+                    // Yet we process between 6 and 12 input bytes, thus we get
+                    // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+                    // for this section of the code. Hence, there is a limit
+                    // to how much we can further increase this latency before
+                    // it seriously harms performance.
+                    size_t consumed = convert_masked_utf8_to_utf32(in + pos,
+                        utf8_end_of_code_point_mask, utf32_output);
+                    pos += consumed;
+                    utf8_end_of_code_point_mask >>= consumed;
+                }
+                // At this point there may remain between 0 and 12 bytes in the
+                // 64-byte block. These bytes will be processed again. So we have an
+                // 80% efficiency (in the worst case). In practice we expect an
+                // 85% to 90% efficiency.
+            }
+        }
+        if (errors()) { // NOTE(review): every check inside the loop is already followed by an errors() test, so this looks like a pure safety net -- confirm
             result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
             res.count += pos;
             return res;
-          }
-          uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-          uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-          uint64_t utf8_end_of_code_point_mask = utf8_leading_mask>>1;
-          // We process in blocks of up to 12 bytes except possibly
-          // for fast paths which may process up to 16 bytes. For the
-          // slow path to work, we should have at least 12 input bytes left.
-          size_t max_starting_point = (pos + 64) - 12;
-          // Next loop is going to run at least five times.
-          while(pos < max_starting_point) {
-            // Performance note: our ability to compute 'consumed' and
-            // then shift and recompute is critical. If there is a
-            // latency of, say, 4 cycles on getting 'consumed', then
-            // the inner loop might have a total latency of about 6 cycles.
-            // Yet we process between 6 to 12 inputs bytes, thus we get
-            // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-            // for this section of the code. Hence, there is a limit
-            // to how much we can further increase this latency before
-            // it seriously harms performance.
-            size_t consumed = convert_masked_utf8_to_utf32(in + pos,
-                            utf8_end_of_code_point_mask, utf32_output);
-            pos += consumed;
-            utf8_end_of_code_point_mask >>= consumed;
-          }
-          // At this point there may remain between 0 and 12 bytes in the
-          // 64-byte block.These bytes will be processed again. So we have an
-          // 80% efficiency (in the worst case). In practice we expect an
-          // 85% to 90% efficiency.
-        }
-      }
-      if(errors()) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        res.count += pos;
-        return res;
-      }
-      if(pos < size) {
-        result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
-        if (res.error) {    // In case of error, we want the error position
-          res.count += pos;
-          return res;
-        } else {    // In case of success, we want the number of word written
-          utf32_output += res.count;
         }
-      }
-      return result(error_code::SUCCESS, utf32_output - start);
+        if (pos < size) { // the remaining tail, safety margin included, goes through the scalar converter
+            result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, in + pos, size - pos, utf32_output);
+            if (res.error) { // In case of error, we want the error position
+                res.count += pos;
+                return res;
+            } else { // In case of success, we want the number of words written
+                utf32_output += res.count;
+            }
+        }
+        return result(error_code::SUCCESS, utf32_output - start);
     }
 
-    simdutf_really_inline bool errors() const {
-      return this->error.any_bits_set_anywhere();
+    simdutf_really_inline bool errors() const
+    { // True once any bit of the accumulated `error` vector is set.
+        return this->error.any_bits_set_anywhere();
     }
 
-  }; // struct utf8_checker
+}; // struct utf8_checker
 } // utf8_to_utf32 namespace
 } // unnamed namespace
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf8.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf8.h
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
@@ -27771,36 +31282,37 @@ namespace utf8 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char* in, size_t size) {
+simdutf_really_inline size_t count_code_points(const char* in, size_t size)
+{ // Counts the bytes of in[0, size) that are not UTF-8 continuation bytes: 64-byte SIMD blocks first, scalar routine for the tail.
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      count += 64 - count_ones(utf8_continuation_mask);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1); // signed compare: bytes below -64, i.e. 0x80..0xBF
+        count += 64 - count_ones(utf8_continuation_mask); // every non-continuation byte starts a code point
     }
     return count + scalar::utf8::count_code_points(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf16_length_from_utf8(const char* in, size_t size)
+{ // Computes how many UTF-16 code units the UTF-8 bytes in[0, size) need: 64-byte SIMD blocks first, scalar routine for the tail.
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 64 <= size; pos += 64) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-      // We count one word for anything that is not a continuation (so
-      // leading bytes).
-      count += 64 - count_ones(utf8_continuation_mask);
-      int64_t utf8_4byte = input.gteq_unsigned(240);
-      count += count_ones(utf8_4byte);
+    for (; pos + 64 <= size; pos += 64) {
+        simd8x64<int8_t> input(reinterpret_cast<const int8_t*>(in + pos));
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1); // signed compare: bytes below -64, i.e. 0x80..0xBF
+        // We count one UTF-16 code unit for every byte that is not a
+        // continuation byte (i.e. for every leading byte).
+        count += 64 - count_ones(utf8_continuation_mask);
+        int64_t utf8_4byte = input.gteq_unsigned(240); // bytes >= 0xF0 lead 4-byte sequences
+        count += count_ones(utf8_4byte); // those code points need a surrogate pair, hence one extra code unit each
     }
     return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
 
-
-simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size) {
+simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
+{ // Same value as count_code_points(in, size): this simply forwards to it.
     return count_code_points(in, size);
 }
 } // utf8 namespace
@@ -27808,64 +31320,72 @@ simdutf_really_inline size_t utf32_length_from_utf8(const char* in, size_t size)
 } // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=generic/utf16.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=generic/utf16.h
 /* begin file src/generic/utf16.h */
 namespace simdutf {
 namespace westmere {
 namespace {
 namespace utf16 {
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t* in, size_t size)
+{ // Counts the UTF-16 code units of in[0, size) that lie outside 0xDC00..0xDFFF: 32-unit SIMD blocks first, scalar routine for the tail.
     size_t pos = 0;
     size_t count = 0;
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-      count += count_ones(not_pair) / 2;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) { // byte-swap when match_system() reports a mismatch for the requested endianness
+            input.swap_bytes();
+        }
+        uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); // code units that are not in the low-surrogate range
+        count += count_ones(not_pair) / 2; // presumably the mask carries two bits per 16-bit lane, hence the halving -- confirm against simd16x32
     }
     return count + scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t* in, size_t size)
+{ // Computes how many UTF-8 bytes the UTF-16 code units in[0, size) need: 32-unit SIMD blocks first, scalar routine for the tail.
     size_t pos = 0;
     size_t count = 0;
     // This algorithm could no doubt be improved!
-    for(;pos + 32 <= size; pos += 32) {
-      simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-      if (!match_system(big_endian)) input.swap_bytes();
-      uint64_t ascii_mask = input.lteq(0x7F);
-      uint64_t twobyte_mask = input.lteq(0x7FF);
-      uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-      size_t ascii_count = count_ones(ascii_mask) / 2;
-      size_t twobyte_count = count_ones(twobyte_mask & ~ ascii_mask) / 2;
-      size_t threebyte_count = count_ones(not_pair_mask & ~ twobyte_mask) / 2;
-      size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-      count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count;
+    for (; pos + 32 <= size; pos += 32) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        if (!match_system(big_endian)) { // byte-swap when match_system() reports a mismatch for the requested endianness
+            input.swap_bytes();
+        }
+        uint64_t ascii_mask = input.lteq(0x7F); // includes nothing above U+007F
+        uint64_t twobyte_mask = input.lteq(0x7FF); // includes the ASCII units too; they are masked out below
+        uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); // everything that is not a surrogate code unit
+
+        size_t ascii_count = count_ones(ascii_mask) / 2; // "/ 2": presumably two mask bits per 16-bit lane -- confirm against simd16x32
+        size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+        size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+        size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; // surrogate code units in this block
+        count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + ascii_count; // each surrogate unit weighs 2, so a pair adds 4 bytes
     }
     return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos, size - pos);
 }
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size) {
+template<endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t* in, size_t size)
+{ // Same value as count_code_points<big_endian>(in, size): this simply forwards to it.
     return count_code_points<big_endian>(in, size);
 }
 
-simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output) {
-  size_t pos = 0;
+simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t size, char16_t* output)
+{ // Writes a byte-swapped copy of the `size` code units at `in` to `output`: 32 units per SIMD step, scalar routine for the tail.
+    size_t pos = 0;
 
-  while (pos + 32 <= size) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+    while (pos + 32 <= size) {
+        simd16x32<uint16_t> input(reinterpret_cast<const uint16_t*>(in + pos));
+        input.swap_bytes();
+        input.store(reinterpret_cast<uint16_t*>(output));
+        pos += 32;
+        output += 32; // output advances in lockstep with pos, so the scalar call below continues at the right spot
+    }
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+    scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
 } // utf16
@@ -27880,467 +31400,667 @@ simdutf_really_inline void change_endianness_utf16(const char16_t* in, size_t si
 namespace simdutf {
 namespace westmere {
 
-simdutf_warn_unused int implementation::detect_encodings(const char * input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if(bom_encoding != encoding_type::unspecified) { return bom_encoding; }
-  if (length % 2 == 0) {
-    return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
-  } else {
-    if (implementation::validate_utf8(input, length)) {
-      return simdutf::encoding_type::UTF8;
+simdutf_warn_unused int implementation::detect_encodings(const char* input, size_t length) const noexcept
+{ // Returns the BOM's encoding when a BOM is found; otherwise even-length input goes through sse_detect_encodings and odd-length input is only tested as UTF-8.
+    // If there is a BOM, then we trust it.
+    auto bom_encoding = simdutf::BOM::check_bom(input, length);
+    if (bom_encoding != encoding_type::unspecified) {
+        return bom_encoding;
+    }
+    if (length % 2 == 0) { // odd lengths skip the multi-encoding detector -- presumably because UTF-16/UTF-32 need an even byte count (confirm)
+        return sse_detect_encodings<utf8_validation::utf8_checker>(input, length);
     } else {
-      return simdutf::encoding_type::unspecified;
+        if (implementation::validate_utf8(input, length)) {
+            return simdutf::encoding_type::UTF8;
+        } else {
+            return simdutf::encoding_type::unspecified;
+        }
     }
-  }
 }
 
-simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_utf8(buf, len);
+simdutf_warn_unused bool implementation::validate_utf8(const char* buf, size_t len) const noexcept
+{ // Thin forwarder to the generic UTF-8 validator in the westmere namespace.
+    return westmere::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(const char* buf, size_t len) const noexcept
+{ // Thin forwarder to the generic error-reporting UTF-8 validator in the westmere namespace.
+    return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_ascii(buf, len);
+simdutf_warn_unused bool implementation::validate_ascii(const char* buf, size_t len) const noexcept
+{ // Thin forwarder to the generic ASCII validator in the westmere namespace.
+    return westmere::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,len);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(const char* buf, size_t len) const noexcept
+{ // Thin forwarder to the generic error-reporting ASCII validator in the westmere namespace.
+    return westmere::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16le(const char16_t* buf, size_t len) const noexcept
+{ // SIMD pass first; the scalar little-endian validator then checks the range starting at the pointer the SIMD pass returned.
+    const char16_t* tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
+    if (tail) { // presumably points at the first code unit the SIMD pass left unchecked -- confirm against sse_validate_utf16
+        return scalar::utf16::validate<endianness::LITTLE>(tail, len - (tail - buf));
+    } else { // a null tail is reported as invalid input
+        return false;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept {
-  const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf16be(const char16_t* buf, size_t len) const noexcept
+{ // SIMD pass first; the scalar big-endian validator then checks the range starting at the pointer the SIMD pass returned.
+    const char16_t* tail = sse_validate_utf16<endianness::BIG>(buf, len);
+    if (tail) { // presumably points at the first code unit the SIMD pass left unchecked -- confirm against sse_validate_utf16
+        return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+    } else { // a null tail is reported as invalid input
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t* buf, size_t len) const noexcept
+{ // Runs the SIMD validator; if it did not account for all `len` code units, the scalar validator continues from res.count.
+    result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count); // the scalar count is relative to buf + res.count, so add the offset back
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t* buf, size_t len) const noexcept
+{ // Runs the SIMD validator; if it did not account for all `len` code units, the scalar validator continues from res.count.
+    result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count); // the scalar count is relative to buf + res.count, so add the offset back
+    } else {
+        return res;
+    }
 }
 
-simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* tail = sse_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
+simdutf_warn_unused bool implementation::validate_utf32(const char32_t* buf, size_t len) const noexcept
+{ // SIMD pass first; the scalar validator then checks the range starting at the pointer the SIMD pass returned.
+    const char32_t* tail = sse_validate_utf32le(buf, len);
+    if (tail) { // presumably points at the first code unit the SIMD pass left unchecked -- confirm against sse_validate_utf32le
+        return scalar::utf32::validate(tail, len - (tail - buf));
+    } else { // a null tail is reported as invalid input
+        return false;
+    }
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
+simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t* buf, size_t len) const noexcept
+{ // Runs the SIMD validator; if it did not account for all `len` code units, the scalar validator continues from res.count.
+    result res = sse_validate_utf32le_with_errors(buf, len);
+    if (res.count != len) {
+        result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+        return result(scalar_res.error, res.count + scalar_res.count); // the scalar count is relative to buf + res.count, so add the offset back
+    } else {
+        return res;
+    }
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(const char* buf, size_t len, char* utf8_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar Latin-1 -> UTF-8 routine and returns its result unchanged.
+    return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar Latin-1 -> UTF-16 routine instantiated for little-endian output.
+    return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar Latin-1 -> UTF-16 routine instantiated for big-endian output.
+    return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar Latin-1 -> UTF-32 routine; utf32_output receives the char32_t code units.
+    return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar UTF-8 -> Latin-1 routine and returns its result unchanged.
+    return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
 }
 
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar error-reporting UTF-8 -> Latin-1 routine.
+    return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char* buf, size_t len, char* latin1_output) const noexcept
+{ // No SIMD path in this kernel: forwards to the scalar convert_valid routine (presumably skips validation, as the name suggests -- confirm).
+    return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Builds a fresh utf8_to_utf16::validating_transcoder per call and returns what its little-endian convert() yields.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Builds a fresh utf8_to_utf16::validating_transcoder per call and returns what its big-endian convert() yields.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // Builds a fresh utf8_to_utf16::validating_transcoder per call and returns what its little-endian convert_with_errors() yields.
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(const char* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    utf8_to_utf16::validating_transcoder converter;
+    return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size, utf16_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(const char* input, size_t size,
-    char16_t* utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,  utf16_output);
+    char16_t* utf16_output) const noexcept
+{
+    return utf8_to_utf16::convert_valid<endianness::BIG>(input, size, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    utf8_to_utf32::validating_transcoder converter;
+    return converter.convert_with_errors(buf, len, utf32_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(const char* input, size_t size,
-    char32_t* utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size,  utf32_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    char32_t* utf32_output) const noexcept
+{
+    return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(const char16_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
     }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char16_t*, char*> ret = sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len, utf8_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const char16_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf8_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first.error) { return ret.first; }  // Can return directly since scalar fallback already found correct ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf32_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) { return 0; }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
-                                        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) { return 0; }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of words written even if finished
-  std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-                                        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) const noexcept
+{
+    return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    std::pair<const char32_t*, char*> ret = sse_convert_utf32_to_utf8(buf, len, utf8_output);
+    if (ret.first == nullptr) {
+        return 0;
     }
-  }
-  ret.first.count = ret.second - utf16_output;   // Set count to the number of 8-bit words written
-  return ret.first;
+    size_t saved_bytes = ret.second - utf8_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char*> ret = westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf8_output; // Set count to the number of 8-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    std::pair<const char16_t*, char32_t*> ret = sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf32_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char32_t*> ret = westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len, utf32_output);
+    if (ret.first.error) {
+        return ret.first;
+    } // Can return directly since scalar fallback already found correct ret.first.count
+    if (ret.first.count != len) { // All good so far, but not finished
+        result scalar_res = scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf32_output; // Set count to the number of 32-bit words written
+    return ret.first;
 }
 
-void implementation::change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept
+{
+    return convert_utf32_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    std::pair<const char32_t*, char16_t*> ret = sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first == nullptr) {
+        return 0;
+    }
+    size_t saved_bytes = ret.second - utf16_output;
+    if (ret.first != buf + len) {
+        const size_t scalar_saved_bytes = scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+        if (scalar_saved_bytes == 0) {
+            return 0;
+        }
+        saved_bytes += scalar_saved_bytes;
+    }
+    return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::count_utf8(const char * input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len, utf16_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words written
+    return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    // ret.first.count is always the position in the buffer, not the number of words written even if finished
+    std::pair<result, char16_t*> ret = westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len, utf16_output);
+    if (ret.first.count != len) {
+        result scalar_res = scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+        if (scalar_res.error) {
+            scalar_res.count += ret.first.count;
+            return scalar_res;
+        } else {
+            ret.second += scalar_res.count;
+        }
+    }
+    ret.first.count = ret.second - utf16_output; // Set count to the number of 16-bit words written
+    return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{
+    return convert_utf32_to_utf16le(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(const char32_t* buf, size_t len, char16_t* utf16_output) const noexcept
+{ // No separate path for pre-validated input here: forwards to convert_utf32_to_utf16be.
+    return convert_utf32_to_utf16be(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // No separate path for pre-validated input here: forwards to convert_utf16le_to_utf32.
+    return convert_utf16le_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(const char16_t* buf, size_t len, char32_t* utf32_output) const noexcept
+{ // No separate path for pre-validated input here: forwards to convert_utf16be_to_utf32.
+    return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+void implementation::change_endianness_utf16(const char16_t* input, size_t length, char16_t* output) const noexcept
+{ // Forwards to utf16::change_endianness_utf16 with the arguments unchanged.
+    utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m128i v_00000000 = _mm_setzero_si128();
-  const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
-  const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
-    const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
-    const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
-    const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-    const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-    const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
-    const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
-    const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
+simdutf_warn_unused size_t implementation::count_utf16le(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::count_code_points instantiated for little-endian input.
+    return utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::count_code_points instantiated for big-endian input.
+    return utf16::count_code_points<endianness::BIG>(input, length);
+}
 
-    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-    count += 16 - 3*ascii_count - 2*two_bytes_count - three_bytes_count;
-  }
-  return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::count_utf8(const char* input, size_t length) const noexcept
+{ // Forwards to utf8::count_code_points.
+    return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const __m128i v_00000000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  size_t pos = 0;
-  size_t count = 0;
-  for(;pos + 4 <= length; pos += 4) {
-    __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
-    const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-    const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
-    size_t surrogate_count = (16-count_ones(surrogate_bitmask))/4;
-    count += 4 + surrogate_count;
-  }
-  return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(const char* buf, size_t len) const noexcept
+{ // Forwards to the scalar:: routine; no SIMD-specific code is involved in this function.
+    return scalar::utf8::latin1_length_from_utf8(buf, len);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept
+{ // Takes only the unit count, not the data; forwards to the scalar:: helper.
+    return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept
+{ // Takes only the unit count, not the data; forwards to the scalar:: helper.
+    return scalar::utf32::latin1_length_from_utf32(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::utf8_length_from_utf16 instantiated for little-endian input.
+    return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::utf8_length_from_utf16 instantiated for big-endian input.
+    return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept
+{ // Takes only the byte count, not the data; forwards to the scalar:: helper.
+    return scalar::latin1::utf16_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept
+{ // Takes only the byte count, not the data; forwards to the scalar:: helper.
+    return scalar::latin1::utf32_length_from_latin1(length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(const char* input, size_t length) const noexcept
+{ // Forwards to the scalar:: routine; no SIMD-specific code is involved in this function.
+    return scalar::latin1::utf8_length_from_latin1(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::utf32_length_from_utf16 instantiated for little-endian input.
+    return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(const char16_t* input, size_t length) const noexcept
+{ // Forwards to utf16::utf32_length_from_utf16 instantiated for big-endian input.
+    return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char* input, size_t length) const noexcept
+{ // Forwards to utf8::utf16_length_from_utf8.
+    return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{ // Returns the number of UTF-8 bytes needed for `length` UTF-32 values; values are classified by magnitude only.
+    const __m128i v_00000000 = _mm_setzero_si128();
+    const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
+    const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) { // four 32-bit values (one 128-bit load) per iteration
+        __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
+        const __m128i ascii_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000); // lanes < 0x80
+        const __m128i one_two_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000); // lanes < 0x800
+        const __m128i two_bytes_bytemask = _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask); // lanes in 0x80..0x7ff
+        const __m128i one_two_three_bytes_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); // lanes < 0x10000
+        const __m128i three_bytes_bytemask = _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask); // lanes in 0x800..0xffff
+        const uint16_t ascii_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask)); // one bit per byte, so 4 bits per matching lane
+        const uint16_t two_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
+        const uint16_t three_bytes_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
+
+        size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; // 4 mask bits per lane -> number of lanes
+        size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+        size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+        count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; // 4 bytes for each of the 4 lanes, minus what the shorter encodings save
+    }
+    return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); // remaining 0-3 values go through the scalar routine
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t* input, size_t length) const noexcept
+{ // Returns the number of UTF-16 units needed for `length` UTF-32 values; values are classified by magnitude only.
+    const __m128i v_00000000 = _mm_setzero_si128();
+    const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+    size_t pos = 0;
+    size_t count = 0;
+    for (; pos + 4 <= length; pos += 4) { // four 32-bit values (one 128-bit load) per iteration
+        __m128i in = _mm_loadu_si128((__m128i*)(input + pos));
+        const __m128i surrogate_bytemask = _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); // despite the name: lanes < 0x10000, i.e. those that fit in ONE unit
+        const uint16_t surrogate_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask)); // one bit per byte, so 4 bits per matching lane
+        size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4; // lanes NOT matched above, each needing a surrogate pair
+        count += 4 + surrogate_count; // one unit per value, plus one extra per surrogate pair
+    }
+    return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); // remaining 0-3 values go through the scalar routine
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char * input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(const char* input, size_t length) const noexcept
+{ // Computed as a code-point count; note this calls the scalar:: counter, whereas count_utf8 calls utf8::count_code_points.
+    return scalar::utf8::count_code_points(input, length);
 }
 
 } // namespace westmere
 } // namespace simdutf
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/src, filename=simdutf/westmere/end.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/src, filename=simdutf/westmere/end.h
 /* begin file src/simdutf/westmere/end.h */
 #if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
 // nothing needed.
diff --git a/src/bun.js/bindings/simdutf.h b/src/bun.js/bindings/simdutf.h
index 0a57a69f7..7fb388e9e 100644
--- a/src/bun.js/bindings/simdutf.h
+++ b/src/bun.js/bindings/simdutf.h
@@ -1,11 +1,11 @@
-/* auto-generated on 2023-02-10 14:42:58 -0500. Do not edit! */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf.h
+/* auto-generated on 2023-06-21 08:09:45 -0400. Do not edit! */
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf.h
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
 #define SIMDUTF_H
 #include <cstring>
 
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/compiler_check.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/compiler_check.h
 /* begin file include/simdutf/compiler_check.h */
 #ifndef SIMDUTF_COMPILER_CHECK_H
 #define SIMDUTF_COMPILER_CHECK_H
@@ -43,13 +43,13 @@
 
 #endif // SIMDUTF_COMPILER_CHECK_H
 /* end file include/simdutf/compiler_check.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/common_defs.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/common_defs.h
 /* begin file include/simdutf/common_defs.h */
 #ifndef SIMDUTF_COMMON_DEFS_H
 #define SIMDUTF_COMMON_DEFS_H
 
 #include <cassert>
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/portability.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/portability.h
 /* begin file include/simdutf/portability.h */
 #ifndef SIMDUTF_PORTABILITY_H
 #define SIMDUTF_PORTABILITY_H
@@ -144,6 +144,8 @@
 // POWER processors. Please see https://github.com/lemire/simdutf/issues/51
 #elif defined(__s390__)
 // s390 IBM system. Big endian.
+#elif (defined(__riscv) || defined(__riscv__)) && __riscv_xlen == 64
+// RISC-V 64-bit
 #else
 // The simdutf library is designed
 // for 64-bit processors and it seems that you are not
@@ -278,7 +280,7 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif // SIMDUTF_PORTABILITY_H
 /* end file include/simdutf/portability.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/avx512.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/avx512.h
 /* begin file include/simdutf/avx512.h */
 #ifndef SIMDUTF_AVX512_H_
 #define SIMDUTF_AVX512_H_
@@ -458,19 +460,21 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif // MSC_VER
 
-#if defined(SIMDUTF_VISUAL_STUDIO)
-    /**
-     * It does not matter here whether you are using
-     * the regular visual studio or clang under visual
-     * studio.
-     */
-    #if SIMDUTF_USING_LIBRARY
-    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+#ifndef SIMDUTF_DLLIMPORTEXPORT
+    #if defined(SIMDUTF_VISUAL_STUDIO)
+      /**
+       * It does not matter here whether you are using
+       * the regular visual studio or clang under visual
+       * studio.
+       */
+      #if SIMDUTF_USING_LIBRARY
+      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllimport)
+      #else
+      #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+      #endif
     #else
-    #define SIMDUTF_DLLIMPORTEXPORT __declspec(dllexport)
+      #define SIMDUTF_DLLIMPORTEXPORT
     #endif
-#else
-    #define SIMDUTF_DLLIMPORTEXPORT
 #endif
 
 /// If EXPR is an error, returns it.
@@ -479,7 +483,7 @@ use a 64-bit target such as x64, 64-bit ARM or 64-bit PPC.")
 
 #endif // SIMDUTF_COMMON_DEFS_H
 /* end file include/simdutf/common_defs.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/encoding_types.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/encoding_types.h
 /* begin file include/simdutf/encoding_types.h */
 #include <string>
 
@@ -491,6 +495,7 @@ enum encoding_type {
         UTF16_BE = 4,   // BOM 0xfe 0xff
         UTF32_LE = 8,   // BOM 0xff 0xfe 0x00 0x00
         UTF32_BE = 16,   // BOM 0x00 0x00 0xfe 0xff
+        Latin1 = 32,
 
         unspecified = 0
 };
@@ -527,7 +532,7 @@ size_t bom_byte_size(encoding_type bom);
 } // BOM namespace
 } // simdutf namespace
 /* end file include/simdutf/encoding_types.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/error.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/error.h
 /* begin file include/simdutf/error.h */
 #ifndef ERROR_H
 #define ERROR_H
@@ -541,9 +546,10 @@ enum error_code {
   TOO_LONG,     // We either have too many consecutive continuation bytes or the string starts with a continuation byte.
   OVERLONG,     // The decoded character must be above U+7F for two-byte characters, U+7FF for three-byte characters,
                 // and U+FFFF for four-byte characters.
-  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF OR less than or equal than U+7F for ASCII.
+  TOO_LARGE,    // The decoded character must be less than or equal to U+10FFFF, less than or equal to U+7F for ASCII, OR less than or equal to U+FF for Latin1
   SURROGATE,    // The decoded character must be not be in U+D800...DFFF (UTF-8 or UTF-32) OR
-                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16)
+                // a high surrogate must be followed by a low surrogate and a low surrogate must be preceded by a high surrogate (UTF-16) OR
+                // there must be no surrogate at all (Latin1)
   OTHER         // Not related to validation/transcoding.
 };
 
@@ -564,7 +570,7 @@ SIMDUTF_PUSH_DISABLE_WARNINGS
 SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 
 // Public API
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/simdutf_version.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/simdutf_version.h
 /* begin file include/simdutf/simdutf_version.h */
 // /include/simdutf/simdutf_version.h automatically generated by release.py,
 // do not change by hand
@@ -572,7 +578,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "3.2.0"
+#define SIMDUTF_VERSION "3.2.14"
 
 namespace simdutf {
 enum {
@@ -587,13 +593,13 @@ enum {
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */
-  SIMDUTF_VERSION_REVISION = 0
+  SIMDUTF_VERSION_REVISION = 14
 };
 } // namespace simdutf
 
 #endif // SIMDUTF_SIMDUTF_VERSION_H
 /* end file include/simdutf/simdutf_version.h */
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/implementation.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/implementation.h
 /* begin file include/simdutf/implementation.h */
 #ifndef SIMDUTF_IMPLEMENTATION_H
 #define SIMDUTF_IMPLEMENTATION_H
@@ -603,7 +609,7 @@ enum {
 #endif
 #include <vector>
 #include <tuple>
-// dofile: invoked with prepath=/Users/dlemire/CVS/github/simdutf/include, filename=simdutf/internal/isadetection.h
+// dofile: invoked with prepath=/Users/jarred/Build/simdutf/include, filename=simdutf/internal/isadetection.h
 /* begin file include/simdutf/internal/isadetection.h */
 /* From
 https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
@@ -690,22 +696,12 @@ static inline uint32_t detect_supported_architectures() {
   return instruction_set::ALTIVEC;
 }
 
-#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
-
-#if defined(__ARM_NEON)
+#elif defined(__aarch64__) || defined(_M_ARM64)
 
 static inline uint32_t detect_supported_architectures() {
   return instruction_set::NEON;
 }
 
-#else // ARM without NEON
-
-static inline uint32_t detect_supported_architectures() {
-  return instruction_set::DEFAULT;
-}
-
-#endif
-
 #elif defined(__x86_64__) || defined(_M_AMD64) // x64
 
 
@@ -716,6 +712,7 @@ namespace cpuid_bit {
     // EAX = 0x01
     constexpr uint32_t pclmulqdq = uint32_t(1) << 1; ///< @private bit  1 of ECX for EAX=0x1
     constexpr uint32_t sse42 = uint32_t(1) << 20;    ///< @private bit 20 of ECX for EAX=0x1
+    constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26+27 of ECX for EAX=0x1
 
     // EAX = 0x7f (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
     // See: "Table 3-8. Information Returned by CPUID Instruction"
@@ -741,6 +738,10 @@ namespace cpuid_bit {
     namespace edx {
       constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
     }
+    namespace xcr0_bit {
+     constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
+     constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
+   }
   }
 }
 
@@ -750,7 +751,7 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
                          uint32_t *edx) {
 #if defined(_MSC_VER)
   int cpu_info[4];
-  __cpuid(cpu_info, *eax);
+  __cpuidex(cpu_info, *eax, *ecx);
   *eax = cpu_info[0];
   *ebx = cpu_info[1];
   *ecx = cpu_info[2];
@@ -768,6 +769,16 @@ static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
 #endif
 }
 
+static inline uint64_t xgetbv() { // Returns the 64-bit value of extended control register 0 (XCR0).
+ #if defined(_MSC_VER)
+   return _xgetbv(0);
+ #else
+   uint32_t xcr0_lo, xcr0_hi;
+   asm volatile("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0)); // ECX = 0 as input; the result comes back split across EAX (low) and EDX (high)
+   return xcr0_lo | ((uint64_t)xcr0_hi << 32); // stitch the two 32-bit halves together
+ #endif
+ }
+
 static inline uint32_t detect_supported_architectures() {
   uint32_t eax;
   uint32_t ebx = 0;
@@ -787,6 +798,16 @@ static inline uint32_t detect_supported_architectures() {
     host_isa |= instruction_set::PCLMULQDQ;
   }
 
+  if ((ecx & cpuid_bit::osxsave) != cpuid_bit::osxsave) {
+    return host_isa;
+  }
+
+  // xgetbv for checking if the OS saves registers
+  uint64_t xcr0 = xgetbv();
+
+  if ((xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == 0) {
+    return host_isa;
+  }
   // ECX for EAX=0x7
   eax = 0x7;
   ecx = 0x0; // Sub-leaf = 0
@@ -800,6 +821,9 @@ static inline uint32_t detect_supported_architectures() {
   if (ebx & cpuid_bit::ebx::bmi2) {
     host_isa |= instruction_set::BMI2;
   }
+  if (!((xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved)) {
+    return host_isa;
+  }
   if (ebx & cpuid_bit::ebx::avx512f) {
     host_isa |= instruction_set::AVX512F;
   }
@@ -822,7 +846,7 @@ static inline uint32_t detect_supported_architectures() {
 }
 #else // fallback
 
-
+// includes 32-bit ARM.
 static inline uint32_t detect_supported_architectures() {
   return instruction_set::DEFAULT;
 }
@@ -870,7 +894,6 @@ simdutf_really_inline simdutf_warn_unused int detect_encodings(const uint8_t * i
   return detect_encodings(reinterpret_cast<const char *>(input), length);
 }
 
-
 /**
  * Validate the UTF-8 string. This function may be best when you expect
  * the input to be almost always valid. Otherwise, consider using
@@ -1034,6 +1057,68 @@ simdutf_warn_unused bool validate_utf32(const char32_t *buf, size_t len) noexcep
  */
 simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, size_t len) noexcept;
 
+  /**
+   * Convert Latin1 string into UTF8 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf8_output   the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) noexcept;
+
+
+    /**
+   * Convert Latin1 string into UTF-16LE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1  string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF-16BE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
+
+  /**
+   * Convert Latin1 string into UTF-32 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
+
+ /**
+   * Convert possibly broken UTF-8 string into latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
+
 /**
  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16 string.
  *
@@ -1073,6 +1158,20 @@ simdutf_warn_unused size_t convert_utf8_to_utf16le(const char * input, size_t le
  */
 simdutf_warn_unused size_t convert_utf8_to_utf16be(const char * input, size_t length, char16_t* utf16_output) noexcept;
 
+
+  /**
+   * Convert possibly broken UTF-8 string into latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) noexcept;
+
 /**
  * Using native endianness; Convert possibly broken UTF-8 string into UTF-16
  * string and stop on error.
@@ -1139,6 +1238,21 @@ simdutf_warn_unused size_t convert_utf8_to_utf32(const char * input, size_t leng
  */
 simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input, size_t length, char32_t* utf32_output) noexcept;
 
+    /**
+   * Convert valid UTF-8 string into latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) noexcept;
+
+
 /**
  * Using native endianness; Convert valid UTF-8 string into UTF-16 string.
  *
@@ -1187,6 +1301,29 @@ simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(const char * input, siz
  */
 simdutf_warn_unused size_t convert_valid_utf8_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) noexcept;
 
+
+ /**
+   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the Latin1 string as UTF-8
+   */
+    simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t length) noexcept;
+
+  /**
+   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the UTF-8 string as Latin1
+   */
+    simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t length) noexcept;
+
 /**
  * Compute the number of 2-byte words that this UTF-8 string would require in UTF-16LE format.
  *
@@ -1230,6 +1367,38 @@ simdutf_warn_unused size_t utf32_length_from_utf8(const char * input, size_t len
  */
 simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
 /**
  * Convert possibly broken UTF-16LE string into UTF-8 string.
  *
@@ -1260,6 +1429,35 @@ simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t * input, size_
  */
 simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
 /**
  * Using native endianness; Convert possibly broken UTF-16 string into UTF-8 string and stop on error.
  *
@@ -1319,6 +1517,36 @@ simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(const char16_t *
  */
 simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, size_t length, char* utf8_buffer) noexcept;
 
+
+  /**
+   * Convert valid UTF-16LE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert valid UTF-16BE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
 /**
  * Convert valid UTF-16LE string into UTF-8 string.
  *
@@ -1480,6 +1708,21 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(const char16_t * input
  */
 simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input, size_t length, char32_t* utf32_buffer) noexcept;
 
+
+/*
+   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as Latin1
+   */
+  simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
+
+
 /**
  * Using native endianness; Compute the number of bytes that this UTF-16
  * string would require in UTF-8 format.
@@ -1588,6 +1831,53 @@ simdutf_warn_unused size_t convert_utf32_to_utf16(const char32_t * input, size_t
  */
 simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t * input, size_t length, char16_t* utf16_buffer) noexcept;
 
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+
+  simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
+  /**
+   * Convert valid UTF-32 string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) noexcept;
+
 /**
  * Convert possibly broken UTF-32 string into UTF-16BE string.
  *
@@ -2021,6 +2311,96 @@ public:
   simdutf_warn_unused virtual result validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept = 0;
 
   /**
+   * Convert Latin1 string into UTF8 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf8_output    the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf8(const char * input, size_t length, char* utf8_output) const noexcept = 0;
+
+
+    /**
+   * Convert Latin1 string into UTF-16LE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf16le(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-16BE string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf16_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char16_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf16be(const char * input, size_t length, char16_t* utf16_output) const noexcept = 0;
+
+  /**
+   * Convert Latin1 string into UTF-32 string.
+   *
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @param utf32_buffer  the pointer to buffer that can hold conversion result
+   * @return the number of written char32_t; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_latin1_to_utf32(const char * input, size_t length, char32_t* utf32_buffer) const noexcept = 0;
+
+ /**
+   * Convert possibly broken UTF-8 string into latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-8 string into latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of words validated if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+    /**
+   * Convert valid UTF-8 string into latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-8.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @param latin1_output  the pointer to buffer that can hold conversion result
+   * @return the number of written char; 0 if the input was not valid UTF-8 string
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf8_to_latin1(const char * input, size_t length, char* latin1_output) const noexcept = 0;
+
+
+  /**
    * Convert possibly broken UTF-8 string into UTF-16LE string.
    *
    * During the conversion also validation of the input string is done.
@@ -2159,6 +2539,92 @@ public:
   simdutf_warn_unused virtual size_t utf32_length_from_utf8(const char * input, size_t length) const noexcept = 0;
 
   /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16LE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-16BE string
+   */
+  simdutf_warn_unused virtual size_t convert_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16LE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16le_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-16BE string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+  simdutf_warn_unused virtual result convert_utf16be_to_latin1_with_errors(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16LE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16LE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16le_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-16BE string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-16BE.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16BE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf16be_to_latin1(const char16_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
    * Convert possibly broken UTF-16LE string into UTF-8 string.
    *
    * During the conversion also validation of the input string is done.
@@ -2361,6 +2827,52 @@ public:
   simdutf_warn_unused virtual size_t utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept = 0;
 
   /**
+   * Convert possibly broken UTF-32 string into Latin1 string.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return number of written words; 0 if input is not a valid UTF-32 string
+   */
+
+  simdutf_warn_unused virtual size_t convert_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert possibly broken UTF-32 string into Latin1 string and stop on error.
+   *
+   * During the conversion also validation of the input string is done.
+   * This function is suitable to work with inputs from untrusted sources.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold conversion result
+   * @return a result pair struct with an error code and either the position of the error if any or the number of char written if successful.
+   */
+
+  simdutf_warn_unused virtual result convert_utf32_to_latin1_with_errors(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
+   * Convert valid UTF-32 string into Latin1 string.
+   *
+   * This function assumes that the input string is valid UTF-32.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @param latin1_buffer   the pointer to buffer that can hold the conversion result
+   * @return number of written words; 0 if conversion is not possible
+   */
+  simdutf_warn_unused virtual size_t convert_valid_utf32_to_latin1(const char32_t * input, size_t length, char* latin1_buffer) const noexcept = 0;
+
+  /**
    * Convert possibly broken UTF-32 string into UTF-8 string.
    *
    * During the conversion also validation of the input string is done.
@@ -2404,6 +2916,17 @@ public:
    */
   simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
 
+
+    /**
+   * Return the number of 2-byte words (char16_t) that this Latin1 string would require in UTF-16 format.
+   *
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of 2-byte words (char16_t) required to encode the Latin1 string as UTF-16
+   */
+    simdutf_warn_unused virtual size_t utf16_length_from_latin1(size_t length) const noexcept = 0;
+
   /**
    * Convert possibly broken UTF-32 string into UTF-16LE string.
    *
@@ -2506,6 +3029,15 @@ public:
    */
   virtual void change_endianness_utf16(const char16_t * input, size_t length, char16_t * output) const noexcept = 0;
 
+ /**
+   * Return the number of bytes that this Latin1 string would require in UTF-8 format.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the Latin1 string as UTF-8
+   */
+    simdutf_warn_unused virtual size_t utf8_length_from_latin1(const char * input, size_t length) const noexcept = 0;
+
   /**
    * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
    *
@@ -2518,6 +3050,41 @@ public:
   simdutf_warn_unused virtual size_t utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
 
   /**
+   * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-32 string to convert
+   * @param length        the length of the string in 4-byte words (char32_t)
+   * @return the number of bytes required to encode the UTF-32 string as Latin1
+   */
+    simdutf_warn_unused virtual size_t latin1_length_from_utf32( size_t length) const noexcept = 0;
+
+  /**
+   * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the UTF-8 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of bytes required to encode the UTF-8 string as Latin1
+   */
+    simdutf_warn_unused virtual size_t latin1_length_from_utf8(const char * input, size_t length) const noexcept = 0;
+
+/*
+   * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
+   *
+   * This function does not validate the input.
+   *
+   * This function is not BOM-aware.
+   *
+   * @param input         the UTF-16LE string to convert
+   * @param length        the length of the string in 2-byte words (char16_t)
+   * @return the number of bytes required to encode the UTF-16LE string as Latin1
+   */
+  simdutf_warn_unused virtual size_t latin1_length_from_utf16(size_t length) const noexcept = 0;
+
+  /**
    * Compute the number of two-byte words that this UTF-32 string would require in UTF-16 format.
    *
    * This function does not validate the input.
@@ -2528,6 +3095,18 @@ public:
    */
   simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
 
+
+    /**
+   * Return the number of 4-byte words (char32_t) that this Latin1 string would require in UTF-32 format.
+   *
+   * This function does not validate the input.
+   *
+   * @param input         the Latin1 string to convert
+   * @param length        the length of the string in bytes
+   * @return the number of 4-byte words (char32_t) required to encode the Latin1 string as UTF-32
+   */
+    simdutf_warn_unused virtual size_t utf32_length_from_latin1(size_t length) const noexcept = 0;
+
   /*
    * Compute the number of bytes that this UTF-16LE string would require in UTF-32 format.
    *
diff --git a/src/bun.js/bindings/sqlite/JSSQLStatement.cpp b/src/bun.js/bindings/sqlite/JSSQLStatement.cpp
index a6855fd19..61ac91ba7 100644
--- a/src/bun.js/bindings/sqlite/JSSQLStatement.cpp
+++ b/src/bun.js/bindings/sqlite/JSSQLStatement.cpp
@@ -107,6 +107,50 @@ static JSC_DECLARE_HOST_FUNCTION(jsSQLStatementDeserialize);
         return JSValue::encode(jsUndefined());                                                                     \
     }
 
+// Couples a sqlite3 connection with a counter the execute path bumps after stepping a non-readonly statement.
+class VersionSqlite3 {
+public:
+    sqlite3* db; // reset to nullptr by the close path once the connection has been closed
+    std::atomic<uint64_t> version { 0 };
+    explicit VersionSqlite3(sqlite3* db)
+        : db(db)
+    {
+    }
+};
+
+// Storage behind the file-level registry: the list of database entries plus schema version counters.
+struct SQLiteSingleton {
+    Vector<VersionSqlite3*> databases;
+    Vector<std::atomic<uint64_t>> schema_versions;
+};
+
+static SQLiteSingleton* _instance = nullptr;
+
+// Returns the shared list of database entries, creating the registry on first use.
+// NOTE(review): nothing here synchronizes access; confirm callers stay on a single thread.
+static Vector<VersionSqlite3*>& databases()
+{
+    if (_instance)
+        return _instance->databases;
+
+    _instance = new SQLiteSingleton();
+    _instance->databases.reserveInitialCapacity(4);
+    return _instance->databases;
+}
+
+// Closes every connection still open in the registry; named for use at process termination.
+extern "C" void Bun__closeAllSQLiteDatabasesForTermination()
+{
+    if (!_instance)
+        return;
+    for (auto* entry : _instance->databases) {
+        if (!entry->db)
+            continue;
+        sqlite3_close_v2(entry->db);
+        entry->db = nullptr; // mark closed so a later lookup of this handle never reuses the closed pointer
+    }
+}
+
 namespace WebCore {
 using namespace JSC;
 
@@ -272,10 +316,6 @@ void JSSQLStatement::destroy(JSC::JSCell* cell)
 
 void JSSQLStatementConstructor::destroy(JSC::JSCell* cell)
 {
-    JSSQLStatementConstructor* thisObject = static_cast<JSSQLStatementConstructor*>(cell);
-    for (auto version_db : thisObject->databases) {
-        delete version_db;
-    }
 }
 
 static inline bool rebindValue(JSC::JSGlobalObject* lexicalGlobalObject, sqlite3_stmt* stmt, int i, JSC::JSValue value, JSC::ThrowScope& scope, bool clone)
@@ -547,8 +587,8 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementDeserialize, (JSC::JSGlobalObject * lexic
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    auto count = thisObject->databases.size();
-    thisObject->databases.append(new VersionSqlite3(db));
+    auto count = databases().size();
+    databases().append(new VersionSqlite3(db));
     RELEASE_AND_RETURN(scope, JSValue::encode(jsNumber(count)));
 }
 
@@ -565,12 +605,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementSerialize, (JSC::JSGlobalObject * lexical
     }
 
     int32_t dbIndex = callFrame->argument(0).toInt32(lexicalGlobalObject);
-    if (UNLIKELY(dbIndex < 0 || dbIndex >= thisObject->databases.size())) {
+    if (UNLIKELY(dbIndex < 0 || dbIndex >= databases().size())) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    sqlite3* db = thisObject->databases[dbIndex]->db;
+    sqlite3* db = databases()[dbIndex]->db;
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Can't do this on a closed database"_s));
         return JSValue::encode(JSC::jsUndefined());
@@ -606,7 +646,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementLoadExtensionFunction, (JSC::JSGlobalObje
     }
 
     int32_t dbIndex = callFrame->argument(0).toInt32(lexicalGlobalObject);
-    if (UNLIKELY(dbIndex < 0 || dbIndex >= thisObject->databases.size())) {
+    if (UNLIKELY(dbIndex < 0 || dbIndex >= databases().size())) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
@@ -620,7 +660,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementLoadExtensionFunction, (JSC::JSGlobalObje
     auto extensionString = extension.toWTFString(lexicalGlobalObject);
     RETURN_IF_EXCEPTION(scope, {});
 
-    sqlite3* db = thisObject->databases[dbIndex]->db;
+    sqlite3* db = databases()[dbIndex]->db;
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Can't do this on a closed database"_s));
         return JSValue::encode(JSC::jsUndefined());
@@ -661,11 +701,11 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementExecuteFunction, (JSC::JSGlobalObject * l
     }
 
     int32_t handle = callFrame->argument(0).toInt32(lexicalGlobalObject);
-    if (thisObject->databases.size() < handle) {
+    if (handle < 0 || handle >= databases().size()) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
-    sqlite3* db = thisObject->databases[handle]->db;
+    sqlite3* db = databases()[handle]->db;
 
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Database has closed"_s));
@@ -724,7 +764,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementExecuteFunction, (JSC::JSGlobalObject * l
 
     rc = sqlite3_step(statement);
     if (!sqlite3_stmt_readonly(statement)) {
-        thisObject->databases[handle]->version++;
+        databases()[handle]->version++;
     }
 
     while (rc == SQLITE_ROW) {
@@ -765,12 +805,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementIsInTransactionFunction, (JSC::JSGlobalOb
 
     int32_t handle = dbNumber.toInt32(lexicalGlobalObject);
 
-    if (handle < 0 || handle > thisObject->databases.size()) {
+    if (handle < 0 || handle >= databases().size()) {
         throwException(lexicalGlobalObject, scope, createRangeError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    sqlite3* db = thisObject->databases[handle]->db;
+    sqlite3* db = databases()[handle]->db;
 
     if (UNLIKELY(!db)) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Database has closed"_s));
@@ -803,12 +843,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementPrepareStatementFunction, (JSC::JSGlobalO
     }
 
     int32_t handle = dbNumber.toInt32(lexicalGlobalObject);
-    if (handle < 0 || handle > thisObject->databases.size()) {
+    if (handle < 0 || handle >= databases().size()) {
         throwException(lexicalGlobalObject, scope, createRangeError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(JSC::jsUndefined());
     }
 
-    sqlite3* db = thisObject->databases[handle]->db;
+    sqlite3* db = databases()[handle]->db;
     if (!db) {
         throwException(lexicalGlobalObject, scope, createRangeError(lexicalGlobalObject, "Cannot use a closed database"_s));
         return JSValue::encode(JSC::jsUndefined());
@@ -848,7 +888,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementPrepareStatementFunction, (JSC::JSGlobalO
     auto* structure = JSSQLStatement::createStructure(vm, lexicalGlobalObject, lexicalGlobalObject->objectPrototype());
     // auto* structure = JSSQLStatement::createStructure(vm, globalObject(), thisObject->getDirect(vm, vm.propertyNames->prototype));
     JSSQLStatement* sqlStatement = JSSQLStatement::create(
-        structure, reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject), statement, thisObject->databases[handle]);
+        structure, reinterpret_cast<Zig::GlobalObject*>(lexicalGlobalObject), statement, databases()[handle]);
     if (bindings.isObject()) {
         auto* castedThis = sqlStatement;
         DO_REBIND(bindings)
@@ -924,8 +964,8 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementOpenStatementFunction, (JSC::JSGlobalObje
     status = sqlite3_db_config(db, SQLITE_DBCONFIG_DEFENSIVE, 1, NULL);
     assert(status == SQLITE_OK);
 
-    auto count = constructor->databases.size();
-    constructor->databases.append(new VersionSqlite3(db));
+    auto count = databases().size();
+    databases().append(new VersionSqlite3(db));
     RELEASE_AND_RETURN(scope, JSValue::encode(jsNumber(count)));
 }
 
@@ -956,12 +996,12 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementCloseStatementFunction, (JSC::JSGlobalObj
 
     int dbIndex = dbNumber.toInt32(lexicalGlobalObject);
 
-    if (dbIndex < 0 || dbIndex >= constructor->databases.size()) {
+    if (dbIndex < 0 || dbIndex >= databases().size()) {
         throwException(lexicalGlobalObject, scope, createError(lexicalGlobalObject, "Invalid database handle"_s));
         return JSValue::encode(jsUndefined());
     }
 
-    sqlite3* db = constructor->databases[dbIndex]->db;
+    sqlite3* db = databases()[dbIndex]->db;
     // no-op if already closed
     if (!db) {
         return JSValue::encode(jsUndefined());
@@ -973,7 +1013,7 @@ JSC_DEFINE_HOST_FUNCTION(jsSQLStatementCloseStatementFunction, (JSC::JSGlobalObj
         return JSValue::encode(jsUndefined());
     }
 
-    constructor->databases[dbIndex]->db = nullptr;
+    databases()[dbIndex]->db = nullptr;
     return JSValue::encode(jsUndefined());
 }
 
diff --git a/src/bun.js/bindings/sqlite/JSSQLStatement.h b/src/bun.js/bindings/sqlite/JSSQLStatement.h
index e63b99fbb..8566fcdd9 100644
--- a/src/bun.js/bindings/sqlite/JSSQLStatement.h
+++ b/src/bun.js/bindings/sqlite/JSSQLStatement.h
@@ -47,17 +47,6 @@
 
 namespace WebCore {
 
-class VersionSqlite3 {
-public:
-    explicit VersionSqlite3(sqlite3* db)
-        : db(db)
-        , version(0)
-    {
-    }
-    sqlite3* db;
-    std::atomic<uint64_t> version;
-};
-
 class JSSQLStatementConstructor final : public JSC::JSFunction {
 public:
     using Base = JSC::JSFunction;
@@ -82,13 +71,9 @@ public:
         return JSC::Structure::create(vm, globalObject, prototype, JSC::TypeInfo(JSC::ObjectType, StructureFlags), info());
     }
 
-    Vector<VersionSqlite3*> databases;
-    Vector<std::atomic<uint64_t>> schema_versions;
-
 private:
     JSSQLStatementConstructor(JSC::VM& vm, NativeExecutable* native, JSGlobalObject* globalObject, JSC::Structure* structure)
         : Base(vm, native, globalObject, structure)
-        , databases()
     {
     }
 
diff --git a/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h b/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h
index 3997c1d88..82a2c6a24 100644
--- a/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h
+++ b/src/bun.js/bindings/webcore/DOMClientIsoSubspaces.h
@@ -29,6 +29,7 @@ public:
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForReadableState;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForPendingVirtualModuleResult;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForCallSite;
+    std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForImportMeta;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForNapiExternal;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForRequireResolveFunction;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForBundlerPlugin;
@@ -37,6 +38,7 @@ public:
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForJSMockImplementation;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForJSMockFunction;
     std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForMockWithImplementationCleanupData;
+    std::unique_ptr<GCClient::IsoSubspace> m_clientSubspaceForProcessObject;
 
 #include "ZigGeneratedClasses+DOMClientIsoSubspaces.h"
     /* --- bun --- */
diff --git a/src/bun.js/bindings/webcore/DOMIsoSubspaces.h b/src/bun.js/bindings/webcore/DOMIsoSubspaces.h
index 4feca1754..f1b290d25 100644
--- a/src/bun.js/bindings/webcore/DOMIsoSubspaces.h
+++ b/src/bun.js/bindings/webcore/DOMIsoSubspaces.h
@@ -30,6 +30,7 @@ public:
     std::unique_ptr<IsoSubspace> m_subspaceForPendingVirtualModuleResult;
     std::unique_ptr<IsoSubspace> m_subspaceForCallSite;
     std::unique_ptr<IsoSubspace> m_subspaceForNapiExternal;
+    std::unique_ptr<IsoSubspace> m_subspaceForImportMeta;
     std::unique_ptr<IsoSubspace> m_subspaceForRequireResolveFunction;
     std::unique_ptr<IsoSubspace> m_subspaceForBundlerPlugin;
     std::unique_ptr<IsoSubspace> m_subspaceForNodeVMScript;
@@ -37,6 +38,7 @@ public:
     std::unique_ptr<IsoSubspace> m_subspaceForJSMockImplementation;
     std::unique_ptr<IsoSubspace> m_subspaceForJSMockFunction;
     std::unique_ptr<IsoSubspace> m_subspaceForMockWithImplementationCleanupData;
+    std::unique_ptr<IsoSubspace> m_subspaceForProcessObject;
 
 #include "ZigGeneratedClasses+DOMIsoSubspaces.h"
     /*-- BUN --*/
diff --git a/src/bun.js/bindings/webcore/EventEmitter.cpp b/src/bun.js/bindings/webcore/EventEmitter.cpp
index 0650d624c..0e273042b 100644
--- a/src/bun.js/bindings/webcore/EventEmitter.cpp
+++ b/src/bun.js/bindings/webcore/EventEmitter.cpp
@@ -35,6 +35,8 @@ bool EventEmitter::addListener(const Identifier& eventType, Ref<EventListener>&&
     }
 
     eventListenersDidChange();
+    if (this->onDidChangeListener)
+        this->onDidChangeListener(*this, eventType, true);
     return true;
 }
 
@@ -62,6 +64,9 @@ bool EventEmitter::removeListener(const Identifier& eventType, EventListener& li
 
     if (data->eventListenerMap.remove(eventType, listener)) {
         eventListenersDidChange();
+
+        if (this->onDidChangeListener)
+            this->onDidChangeListener(*this, eventType, false);
         return true;
     }
     return false;
@@ -93,6 +98,8 @@ bool EventEmitter::removeAllListeners(const Identifier& eventType)
 
     if (data->eventListenerMap.removeAll(eventType)) {
         eventListenersDidChange();
+        if (this->onDidChangeListener)
+            this->onDidChangeListener(*this, eventType, false);
         return true;
     }
     return false;
diff --git a/src/bun.js/bindings/webcore/EventEmitter.h b/src/bun.js/bindings/webcore/EventEmitter.h
index b46bcff5d..8db59c188 100644
--- a/src/bun.js/bindings/webcore/EventEmitter.h
+++ b/src/bun.js/bindings/webcore/EventEmitter.h
@@ -67,6 +67,8 @@ public:
     bool hasActiveEventListeners(const Identifier& eventType) const;
     bool hasEventListeners(JSC::VM& vm, ASCIILiteral eventType) const;
 
+    WTF::Function<void(EventEmitter&, const Identifier& eventName, bool isAdded)> onDidChangeListener { nullptr }; // Optional hook, null by default. EventEmitter.cpp calls it with isAdded = true just before addListener returns true, and with isAdded = false just before removeListener / removeAllListeners return true.
+
     unsigned getMaxListeners() const { return m_maxListeners; };
 
     void setMaxListeners(unsigned count);
@@ -101,7 +103,9 @@ private:
     EventEmitterData* eventTargetData() { return &m_eventTargetData; }
     EventEmitterData* eventTargetDataConcurrently() { return &m_eventTargetData; }
     EventEmitterData& ensureEventEmitterData() { return m_eventTargetData; }
-    void eventListenersDidChange() {}
+    void eventListenersDidChange() // Called from EventEmitter.cpp after the listener map changes; the body is currently empty.
+    {
+    }
 
     void innerInvokeEventListeners(const Identifier&, SimpleEventListenerVector, const MarkedArgumentBuffer& arguments);
     void invalidateEventListenerRegions();
diff --git a/src/bun.js/bindings/webcore/JSCloseEvent.cpp b/src/bun.js/bindings/webcore/JSCloseEvent.cpp
index be07cbcfe..ad7b6ed57 100644
--- a/src/bun.js/bindings/webcore/JSCloseEvent.cpp
+++ b/src/bun.js/bindings/webcore/JSCloseEvent.cpp
@@ -99,7 +99,7 @@ template<> CloseEvent::Init convertDictionary<CloseEvent::Init>(JSGlobalObject&
     if (isNullOrUndefined)
         codeValue = jsUndefined();
     else {
-        codeValue = object->get(&lexicalGlobalObject, Identifier::fromString(vm, "code"_s));
+        codeValue = object->get(&lexicalGlobalObject, WebCore::builtinNames(vm).codePublicName());
         RETURN_IF_EXCEPTION(throwScope, {});
     }
     if (!codeValue.isUndefined()) {
diff --git a/src/bun.js/bindings/webcore/JSEventEmitter.cpp b/src/bun.js/bindings/webcore/JSEventEmitter.cpp
index 995d845cf..959cbd8d7 100644
--- a/src/bun.js/bindings/webcore/JSEventEmitter.cpp
+++ b/src/bun.js/bindings/webcore/JSEventEmitter.cpp
@@ -148,8 +148,8 @@ static const HashTableValue JSEventEmitterPrototypeTableValues[] = {
     { "addListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_addListener, 2 } },
     { "on"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_addListener, 2 } },
     { "once"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_addOnceListener, 2 } },
-    { "prepend"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependListener, 2 } },
-    { "prependOnce"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependOnceListener, 2 } },
+    { "prependListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependListener, 2 } },
+    { "prependOnceListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_prependOnceListener, 2 } },
     { "removeListener"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_removeListener, 2 } },
     { "off"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_removeListener, 2 } },
     { "removeAllListeners"_s, static_cast<unsigned>(JSC::PropertyAttribute::Function), NoIntrinsic, { HashTableValue::NativeFunctionType, jsEventEmitterPrototypeFunction_removeAllListeners, 1 } },
@@ -219,7 +219,7 @@ JSC_DEFINE_CUSTOM_GETTER(jsEventEmitterConstructor, (JSGlobalObject * lexicalGlo
     return JSValue::encode(JSEventEmitter::getConstructor(JSC::getVM(lexicalGlobalObject), prototype->globalObject()));
 }
 
-static inline JSC::EncodedJSValue addListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis, bool once, bool prepend)
+inline JSC::EncodedJSValue JSEventEmitter::addListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis, bool once, bool prepend)
 {
     auto& vm = JSC::getVM(lexicalGlobalObject);
     auto throwScope = DECLARE_THROW_SCOPE(vm);
@@ -251,7 +251,7 @@ static inline JSC::EncodedJSValue addListener(JSC::JSGlobalObject* lexicalGlobal
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_addListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, false, false);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, false, false); // once = false, prepend = false
 }
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_setMaxListenersBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
@@ -280,17 +280,17 @@ static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_getMaxListener
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_addOnceListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, true, false);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, true, false); // once = true, prepend = false
 }
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_prependListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, false, true);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, false, true); // once = false, prepend = true
 }
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_prependOnceListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
-    return addListener(lexicalGlobalObject, callFrame, castedThis, true, true);
+    return JSEventEmitter::addListener(lexicalGlobalObject, callFrame, castedThis, true, true); // once = true, prepend = true
 }
 
 JSC_DEFINE_HOST_FUNCTION(jsEventEmitterPrototypeFunction_addListener, (JSGlobalObject * lexicalGlobalObject, CallFrame* callFrame))
@@ -325,6 +325,11 @@ JSC_DEFINE_HOST_FUNCTION(jsEventEmitterPrototypeFunction_prependOnceListener, (J
 
 static inline JSC::EncodedJSValue jsEventEmitterPrototypeFunction_removeListenerBody(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, typename IDLOperation<JSEventEmitter>::ClassParameter castedThis)
 {
+    return JSEventEmitter::removeListener(lexicalGlobalObject, callFrame, castedThis); // Thin forwarder; the removal logic now lives in JSEventEmitter::removeListener.
+}
+
+inline JSC::EncodedJSValue JSEventEmitter::removeListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis)
+{
     auto& vm = JSC::getVM(lexicalGlobalObject);
     auto throwScope = DECLARE_THROW_SCOPE(vm);
     JSC::JSValue actualThis = callFrame->thisValue();
diff --git a/src/bun.js/bindings/webcore/JSEventEmitter.h b/src/bun.js/bindings/webcore/JSEventEmitter.h
index 855241011..30d62d792 100644
--- a/src/bun.js/bindings/webcore/JSEventEmitter.h
+++ b/src/bun.js/bindings/webcore/JSEventEmitter.h
@@ -27,6 +27,9 @@ public:
     static EventEmitter* toWrapped(JSC::VM&, JSC::JSValue);
     static void destroy(JSC::JSCell*);
 
+    static inline JSC::EncodedJSValue addListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis, bool once, bool prepend); // Shared body behind the add/once/prepend prototype functions; `once` and `prepend` select the variant.
+    static inline JSC::EncodedJSValue removeListener(JSC::JSGlobalObject* lexicalGlobalObject, JSC::CallFrame* callFrame, JSEventEmitter* castedThis); // NOTE(review): both helpers are declared inline here but defined in JSEventEmitter.cpp; confirm that any other translation unit calling them can see a definition.
+
     DECLARE_INFO;
 
     static JSC::Structure* createStructure(JSC::VM& vm, JSC::JSGlobalObject* globalObject, JSC::JSValue prototype)
diff --git a/src/bun.js/bindings/webcore/WebSocket.cpp b/src/bun.js/bindings/webcore/WebSocket.cpp
index a346175df..1d6392f44 100644
--- a/src/bun.js/bindings/webcore/WebSocket.cpp
+++ b/src/bun.js/bindings/webcore/WebSocket.cpp
@@ -458,8 +458,8 @@ ExceptionOr<void> WebSocket::send(const String& message)
         return {};
     }
 
-    if (message.length() > 0)
-        this->sendWebSocketString(message);
+    // 0-length is allowed
+    this->sendWebSocketString(message);
 
     return {};
 }
@@ -477,8 +477,8 @@ ExceptionOr<void> WebSocket::send(ArrayBuffer& binaryData)
     }
     char* data = static_cast<char*>(binaryData.data());
     size_t length = binaryData.byteLength();
-    if (length > 0)
-        this->sendWebSocketData(data, length);
+    // 0-length is allowed
+    this->sendWebSocketData(data, length);
     return {};
 }
 
@@ -498,8 +498,8 @@ ExceptionOr<void> WebSocket::send(ArrayBufferView& arrayBufferView)
     auto buffer = arrayBufferView.unsharedBuffer().get();
     char* baseAddress = reinterpret_cast<char*>(buffer->data()) + arrayBufferView.byteOffset();
     size_t length = arrayBufferView.byteLength();
-    if (length > 0)
-        this->sendWebSocketData(baseAddress, length);
+    // 0-length is allowed
+    this->sendWebSocketData(baseAddress, length);
 
     return {};
 }
@@ -1232,14 +1232,19 @@ extern "C" void WebSocket__didCloseWithErrorCode(WebCore::WebSocket* webSocket,
 
 extern "C" void WebSocket__didReceiveText(WebCore::WebSocket* webSocket, bool clone, const ZigString* str)
 {
-    WTF::String wtf_str = Zig::toString(*str);
-    if (clone) {
-        wtf_str = wtf_str.isolatedCopy();
-    }
-
+    WTF::String wtf_str = clone ? Zig::toStringCopy(*str) : Zig::toString(*str); // `clone` selects Zig::toStringCopy over Zig::toString; presumably the copy owns its own bytes, confirm against Zig::toStringCopy.
     webSocket->didReceiveMessage(WTFMove(wtf_str));
 }
 extern "C" void WebSocket__didReceiveBytes(WebCore::WebSocket* webSocket, uint8_t* bytes, size_t len)
 {
     webSocket->didReceiveBinaryData({ bytes, len });
 }
+
+extern "C" void WebSocket__incrementPendingActivity(WebCore::WebSocket* webSocket) // C-linkage entry point that forwards to WebSocket::incPendingActivityCount().
+{
+    webSocket->incPendingActivityCount(); // webSocket is dereferenced without a null check.
+}
+extern "C" void WebSocket__decrementPendingActivity(WebCore::WebSocket* webSocket) // C-linkage entry point that forwards to WebSocket::decPendingActivityCount().
+{
+    webSocket->decPendingActivityCount(); // webSocket is dereferenced without a null check.
+}
+\ No newline at end of file
diff --git a/src/bun.js/bindings/webcore/WebSocket.h b/src/bun.js/bindings/webcore/WebSocket.h
index 42261cfc4..846bd186b 100644
--- a/src/bun.js/bindings/webcore/WebSocket.h
+++ b/src/bun.js/bindings/webcore/WebSocket.h
@@ -111,6 +111,20 @@ public:
         return m_hasPendingActivity.load();
     }
 
+    void incPendingActivityCount() // Records one more pending activity and takes a reference on this object.
+    {
+        m_pendingActivityCount++;
+        ref();
+        updateHasPendingActivity(); // Presumably recomputes m_hasPendingActivity from the new count; confirm against its definition.
+    }
+
+    void decPendingActivityCount() // Releases one pending activity and drops one reference on this object (the counterpart of incPendingActivityCount()).
+    {
+        m_pendingActivityCount--;
+        updateHasPendingActivity(); // Refresh the flag while this object is still guaranteed to be alive.
+        deref(); // Must stay last: releasing the final reference may destroy this object.
+    }
+
 private:
     typedef union AnyWebSocket {
         WebSocketClient* client;
@@ -147,20 +161,6 @@ private:
     void sendWebSocketString(const String& message);
     void sendWebSocketData(const char* data, size_t length);
 
-    void incPendingActivityCount()
-    {
-        m_pendingActivityCount++;
-        ref();
-        updateHasPendingActivity();
-    }
-
-    void decPendingActivityCount()
-    {
-        m_pendingActivityCount--;
-        deref();
-        updateHasPendingActivity();
-    }
-
     void failAsynchronously();
 
     enum class BinaryType { Blob,